/* * Copyright (C) 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.streamhtmlparser.util; import com.google.common.collect.ImmutableSortedSet; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.Matcher; /** * Utility functions for HTML and Javascript that are most likely * not interesting to users outside this package. * * <p>The <code>HtmlParser</code> will be open-sourced hence we took the * decision to keep these utilities in this package as well as not to * leverage others that may exist in the <code>google3</code> code base. * * <p>The functionality exposed is designed to be 100% compatible with * the corresponding logic in the C-version of the HtmlParser as such * we are particularly concerned with cross-language compatibility. * * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used * interchangeably unless otherwise noted. */ public final class HtmlUtils { /** * static utility class */ private HtmlUtils() { } // COV_NF_LINE /** * Indicates the type of content contained in the {@code content} HTML * attribute of the {@code meta} HTML tag. Used by * {@link HtmlUtils#parseContentAttributeForUrl(String)}. * <p>The values are: * <ul> * <li>{@code NONE} if it does not contain a URL in the expected format. * <li>{@code URL_START} if it contains a URL but hasn't seen any of * its contents. * <li>{@code URL} if it contains a URL and has seen at least some of * its contents. * </ul> */ public enum META_REDIRECT_TYPE { NONE, URL_START, URL } /** * A regular expression matching the format of a {@code content} attribute * that contains a URL. Used by {@link #parseContentAttributeForUrl}. */ private static final String META_REDIRECT_REGEX = "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?"; // Safe for use by concurrent threads so we compile once. private static final Pattern META_REDIRECT_PATTERN = Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE); /** * Set of keywords that can precede a regular expression literal. Taken from: * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html"> * Language Syntax</a> * * <p>The token {@code void} was added to the list. Several keywords are * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic * simple we do not differentiate on the version and bundle them all together. */ private static final Set<String> REGEXP_TOKEN_PREFIXS = ImmutableSortedSet.of( "abstract", "break", "case", "catch", "class", "const", "continue", "debugger", "default", "delete", "do", "else", "enum", "eval", "export", "extends", "field", "final", "finally", "for", "function", "goto", "if", "implements", "import", "in", "instanceof", "native", "new", "package", "private", "protected", "public", "return", "static", "switch", "synchronized", "throw", "throws", "transient", "try", "typeof", "var", "void", "volatile", "while", "with"); /** * Set of all HTML attributes which expect a URI (as the value). * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a> */ private static final Set<String> ATTRIBUTE_EXPECTS_URI = ImmutableSortedSet.of( "action", "archive", "background", "cite", "classid", "codebase", "data", "dynsrc", "href", "longdesc", "src", "usemap"); /** * Set of {@code Character}s considered whitespace in Javascript. * See {@link #isJavascriptWhitespace(char)} */ private static final Set<Character> JAVASCRIPT_WHITESPACE = ImmutableSortedSet.of( '\u0009', /* Tab \t */ '\n', /* Line-Feed 0x0A */ '\u000B', /* Vertical Tab 0x0B */ '\u000C', /* Form Feed \f */ '\r', /* Carriage Return 0x0D */ ' ', /* Space 0x20 */ '\u00A0', /* Non-breaking space 0xA0 */ '\u2028', /* Line separator */ '\u2029'); /* Paragraph separator */ /** * Set of {@code Character}s considered whitespace in HTML. * See {@link #isHtmlSpace(char)} */ private static final Set<Character> HTML_WHITESPACE = ImmutableSortedSet.of( ' ', '\t', '\n', '\r', '\u200B'); /** * Determines if the HTML attribute specified expects javascript * for its value. Such is the case for example with the {@code onclick} * attribute. * * <p>Currently returns {@code true} for any attribute name that starts * with "on" which is not exactly correct but we trust a developer to * not use non-spec compliant attribute names (e.g. onbogus). * * @param attribute the name of an HTML attribute * @return {@code false} if the input is null or is not an attribute * that expects javascript code; {@code true} */ public static boolean isAttributeJavascript(String attribute) { return ((attribute != null) && attribute.startsWith("on")); } /** * Determines if the HTML attribute specified expects a {@code style} * for its value. Currently this is only true for the {@code style} * HTML attribute. * * @param attribute the name of an HTML attribute * @return {@code true} iff the attribute name is one that expects a * style for a value; otherwise {@code false} */ public static boolean isAttributeStyle(String attribute) { return "style".equals(attribute); } /** * Determines if the HTML attribute specified expects a {@code URI} * for its value. For example, both {@code href} and {@code src} * expect a {@code URI} but {@code style} does not. Returns * {@code false} if the attribute given was {@code null}. * * @param attribute the name of an HTML attribute * @return {@code true} if the attribute name is one that expects * a URI for a value; otherwise {@code null} * * @see #ATTRIBUTE_EXPECTS_URI */ public static boolean isAttributeUri(String attribute) { return ATTRIBUTE_EXPECTS_URI.contains(attribute); } /** * Determines if the specified character is an HTML whitespace character. * A character is an HTML whitespace character if and only if it is one * of the characters below. * <ul> * <li>A <code>Space</code> character * <li>A <code>Tab</code> character * <li>A <code>Line feed</code> character * <li>A <code>Carriage Return</code> character * <li>A <code>Zero-Width Space</code> character * </ul> * * Note: The list includes the zero-width space (<code>&#x200B;</code>) * which is not included in the C version. * * @param chr the {@code char} to check * @return {@code true} if the character is an HTML whitespace character * * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a> */ public static boolean isHtmlSpace(char chr) { return HTML_WHITESPACE.contains(chr); } /** * Determines if the specified character is an ECMAScript whitespace or line * terminator character. A character is a whitespace or line terminator if * and only if it is one of the characters below: * <ul> * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>, * <code>Form Feed</code>, <code>Space</code>, * <code>No-break space</code>) * <li>A line terminator character (<code>Line Feed</code>, * <code>Carriage Return</code>, <code>Line separator</code>, * <code>Paragraph Separator</code>). * </ul> * * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in * particular, this list is quite different from that in * <code>Character.isWhitespace</code>. * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf"> * ECMAScript Language Specification</a> * * @param chr the {@code char} to check * @return {@code true} or {@code false} * */ public static boolean isJavascriptWhitespace(char chr) { return JAVASCRIPT_WHITESPACE.contains(chr); } /** * Determines if the specified character is a valid character in an * ECMAScript identifier. This determination is currently not exact, * in particular: * <ul> * <li>It does not accept Unicode letters, only ASCII ones. * <li>It does not distinguish between the first character of an identifier * (which cannot contain numbers) and subsequent characters. * </li> * </ul> * * We are considering leveraging <code>Character.isJavaIdentifierStart</code> * and <code>Character.isJavaIdentifierPart</code> given that Java * and Javascript follow similar identifier naming rules but we lose * compatibility with the C-version. * * @param chr {@code char} to check * @return {@code true} if the {@code chr} is a Javascript whitespace * character; otherwise {@code false} */ public static boolean isJavascriptIdentifier(char chr) { return ((chr >= 'a' && chr <= 'z') || (chr >= 'A' && chr <= 'Z') || (chr >= '0' && chr <= '9') || chr == '_' || chr == '$'); } /** * Determines if the input token provided is a valid token prefix to a * javascript regular expression. The token argument is compared against * a {@code Set} of identifiers that can precede a regular expression in the * javascript grammar, and returns {@code true} if the provided * {@code String} is in that {@code Set}. * * @param input the {@code String} token to check * @return {@code true} iff the token is a valid prefix of a regexp */ public static boolean isJavascriptRegexpPrefix(String input) { return REGEXP_TOKEN_PREFIXS.contains(input); } /** * Encodes the specified character using Ascii for convenient insertion into * a single-quote enclosed {@code String}. Printable characters * are returned as-is. Carriage Return, Line Feed, Horizontal Tab, * back-slash and single quote are all backslash-escaped. All other characters * are returned hex-encoded. * * @param chr {@code char} to encode * @return an Ascii-friendly encoding of the given {@code char} */ public static String encodeCharForAscii(char chr) { if (chr == '\'') { return "\\'"; } else if (chr == '\\') { return "\\\\"; } else if (chr >= 32 && chr <= 126) { return String.format("%c", chr); } else if (chr == '\n') { return "\\n"; } else if (chr == '\r') { return "\\r"; } else if (chr == '\t') { return "\\t"; } else { // Cannot apply a precision specifier for integral types. Specifying // 0-padded hex-encoding with minimum width of two. return String.format("\\u%04x", (int)chr); } } /** * Parses the given {@code String} to determine if it contains a URL in the * format followed by the {@code content} attribute of the {@code meta} * HTML tag. * * <p>This function expects to receive the value of the {@code content} HTML * attribute. This attribute takes on different meanings depending on the * value of the {@code http-equiv} HTML attribute of the same {@code meta} * tag. Since we may not have access to the {@code http-equiv} attribute, * we instead rely on parsing the given value to determine if it contains * a URL. * * The specification of the {@code meta} HTML tag can be found in: * http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh * * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the * value contains a URL and whether we are at the start of the URL or past * the start. We are at the start of the URL if and only if one of the two * conditions below is true: * <ul> * <li>The given input does not contain any characters from the URL proper. * Example "5; URL=". * <li>The given input only contains the optional leading single or double * quote leading the URL. Example "5; URL='". * </li> * </ul> * * <p>Examples: * <ul> * <li> Example of a complete {@code meta} tag where the {@code content} * attribute contains a URL [we are not at the start of the URL]: * <pre> * <meta http-equiv="refresh" content="5; URL=http://www.google.com"> * </pre> * <li> Example of a complete {@code meta} tag where the {@code content} * attribute contains a URL [we are at the start of the URL]: * <pre> * <meta http-equiv="refresh" content="5; URL="> * </pre> * <li>Example of a complete {@code meta} tag where the {@code content} * attribute does not contain a URL: * <pre> * <meta http-equiv="content-type" content="text/html"> * </pre> * </ul> * * @param value {@code String} to parse * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence * of a URL in the given value */ public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) { if (value == null) return META_REDIRECT_TYPE.NONE; Matcher matcher = META_REDIRECT_PATTERN.matcher(value); if (!matcher.find()) return META_REDIRECT_TYPE.NONE; // We have more content. if (value.length() > matcher.end()) return META_REDIRECT_TYPE.URL; return META_REDIRECT_TYPE.URL_START; } }