streamhtmlparser/util/HtmlUtils.java

/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser.util;

import com.google.common.collect.ImmutableSortedSet;

import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

/**
 * Utility functions for HTML and Javascript that are most likely
 * not interesting to users outside this package.
 *
 * <p>The <code>HtmlParser</code> will be open-sourced hence we took the
 * decision to keep these utilities in this package as well as not to
 * leverage others that may exist in the <code>google3</code> code base.
 *
 * <p>The functionality exposed is designed to be 100% compatible with
 * the corresponding logic in the C-version of the HtmlParser as such
 * we are particularly concerned with cross-language compatibility.
 *
 * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
 * interchangeably unless otherwise noted.
 */
public final class HtmlUtils {

  /**
   * static utility class
   */
  private HtmlUtils() {
  }  // COV_NF_LINE

  /**
   * Indicates the type of content contained in the {@code content} HTML
   * attribute of the {@code meta} HTML tag. Used by
   * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
   * <p>The values are:
   * <ul>
   * <li>{@code NONE} if it does not contain a URL in the expected format.
   * <li>{@code URL_START} if it contains a URL but hasn't seen any of
   * its contents.
   * <li>{@code URL} if it contains a URL and has seen at least some of
   * its contents.
   * </ul>
   */
  public enum META_REDIRECT_TYPE {
    NONE,
    URL_START,
    URL
  }

  /**
   * A regular expression matching the format of a {@code content} attribute
   * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
   */
  private static final String META_REDIRECT_REGEX =
      "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";

  // Safe for use by concurrent threads so we compile once.
  private static final Pattern META_REDIRECT_PATTERN =
      Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);

  /**
   * Set of keywords that can precede a regular expression literal. Taken from:
   * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
   * Language Syntax</a>
   *
   * <p>The token {@code void} was added to the list. Several keywords are
   * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
   * simple we do not differentiate on the version and bundle them all together.
   */
  private static final Set<String> REGEXP_TOKEN_PREFIXS =
      ImmutableSortedSet.of(
          "abstract",
          "break",
          "case",
          "catch",
          "class",
          "const",
          "continue",
          "debugger",
          "default",
          "delete",
          "do",
          "else",
          "enum",
          "eval",
          "export",
          "extends",
          "field",
          "final",
          "finally",
          "for",
          "function",
          "goto",
          "if",
          "implements",
          "import",
          "in",
          "instanceof",
          "native",
          "new",
          "package",
          "private",
          "protected",
          "public",
          "return",
          "static",
          "switch",
          "synchronized",
          "throw",
          "throws",
          "transient",
          "try",
          "typeof",
          "var",
          "void",
          "volatile",
          "while",
          "with");

  /**
   * Set of all HTML attributes which expect a URI (as the value).
   * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
   */
  private static final Set<String> ATTRIBUTE_EXPECTS_URI =
      ImmutableSortedSet.of(
          "action",
          "archive",
          "background",
          "cite",
          "classid",
          "codebase",
          "data",
          "dynsrc",
          "href",
          "longdesc",
          "src",
          "usemap");

  /**
   * Set of {@code Character}s considered whitespace in Javascript.
   * See {@link #isJavascriptWhitespace(char)}
   */
  private static final Set<Character> JAVASCRIPT_WHITESPACE =
      ImmutableSortedSet.of(
            '\u0009',         /* Tab \t */
            '\n',             /* Line-Feed 0x0A */
            '\u000B',         /* Vertical Tab 0x0B */
            '\u000C',         /* Form Feed \f */
            '\r',             /* Carriage Return 0x0D */
            ' ',              /* Space 0x20 */
            '\u00A0',         /* Non-breaking space 0xA0 */
            '\u2028',         /* Line separator */
            '\u2029');        /* Paragraph separator */

  /**
  * Set of {@code Character}s considered whitespace in HTML.
  * See {@link #isHtmlSpace(char)}
  */
 private static final Set<Character> HTML_WHITESPACE =
      ImmutableSortedSet.of(
          ' ',
          '\t',
          '\n',
          '\r',
          '\u200B');


  /**
   * Determines if the HTML attribute specified expects javascript
   * for its value. Such is the case for example with the {@code onclick}
   * attribute.
   *
   * <p>Currently returns {@code true} for any attribute name that starts
   * with "on" which is not exactly correct but we trust a developer to
   * not use non-spec compliant attribute names (e.g. onbogus).
   *
   * @param attribute the name of an HTML attribute
   * @return {@code false} if the input is null or is not an attribute
   *         that expects javascript code; {@code true}
   */
  public static boolean isAttributeJavascript(String attribute) {
    return ((attribute != null) && attribute.startsWith("on"));
  }

  /**
   * Determines if the HTML attribute specified expects a {@code style}
   * for its value. Currently this is only true for the {@code style}
   * HTML attribute.
   *
   * @param attribute the name of an HTML attribute
   * @return {@code true} iff the attribute name is one that expects a
   *     style for a value; otherwise {@code false}
   */
  public static boolean isAttributeStyle(String attribute) {
    return "style".equals(attribute);
  }

  /**
   * Determines if the HTML attribute specified expects a {@code URI}
   * for its value. For example, both {@code href} and {@code src}
   * expect a {@code URI} but {@code style} does not. Returns
   * {@code false} if the attribute given was {@code null}.
   *
   * @param attribute the name of an HTML attribute
   * @return {@code true} if the attribute name is one that expects
   *         a URI for a value; otherwise {@code null}
   *
   * @see #ATTRIBUTE_EXPECTS_URI
   */
  public static boolean isAttributeUri(String attribute) {
    return ATTRIBUTE_EXPECTS_URI.contains(attribute);
  }

  /**
   * Determines if the specified character is an HTML whitespace character.
   * A character is an HTML whitespace character if and only if it is one
   * of the characters below.
   * <ul>
   * <li>A <code>Space</code> character
   * <li>A <code>Tab</code> character
   * <li>A <code>Line feed</code> character
   * <li>A <code>Carriage Return</code> character
   * <li>A <code>Zero-Width Space</code> character
   * </ul>
   *
   * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
   * which is not included in the C version.
   *
   * @param chr the {@code char} to check
   * @return {@code true} if the character is an HTML whitespace character
   *
   * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
   */
  public static boolean isHtmlSpace(char chr) {
    return HTML_WHITESPACE.contains(chr);
  }

  /**
   * Determines if the specified character is an ECMAScript whitespace or line
   * terminator character. A character is a whitespace or line terminator if
   * and only if it is one of the characters below:
   * <ul>
   * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
   *     <code>Form Feed</code>, <code>Space</code>,
   *     <code>No-break space</code>)
   * <li>A line terminator character (<code>Line Feed</code>,
   *     <code>Carriage Return</code>, <code>Line separator</code>,
   *     <code>Paragraph Separator</code>).
   * </ul>
   *
   * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
   * particular, this list is quite different from that in
   * <code>Character.isWhitespace</code>.
   * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
   * ECMAScript Language Specification</a>
   *
   * @param chr the {@code char} to check
   * @return {@code true} or {@code false}
   *
   */
  public static boolean isJavascriptWhitespace(char chr) {
    return JAVASCRIPT_WHITESPACE.contains(chr);
  }

  /**
   * Determines if the specified character is a valid character in an
   * ECMAScript identifier. This determination is currently not exact,
   * in particular:
   * <ul>
   * <li>It does not accept Unicode letters, only ASCII ones.
   * <li>It does not distinguish between the first character of an identifier
   *     (which cannot contain numbers) and subsequent characters.
   * </li>
   * </ul>
   *
   * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
   * and <code>Character.isJavaIdentifierPart</code> given that Java
   * and Javascript follow similar identifier naming rules but we lose
   * compatibility with the C-version.
   *
   * @param chr {@code char} to check
   * @return {@code true} if the {@code chr} is a Javascript whitespace
   *         character; otherwise {@code false}
   */
  public static boolean isJavascriptIdentifier(char chr) {
    return ((chr >= 'a' && chr <= 'z')
        || (chr >= 'A' && chr <= 'Z')
        || (chr >= '0' && chr <= '9')
        || chr == '_' || chr == '$');
  }

  /**
   * Determines if the input token provided is a valid token prefix to a
   * javascript regular expression.  The token argument is compared against
   * a {@code Set} of identifiers that can precede a regular expression in the
   * javascript grammar, and returns {@code true} if the provided
   * {@code String} is in that {@code Set}.
   *
   * @param input the {@code String} token to check
   * @return {@code true} iff the token is a valid prefix of a regexp
   */
  public static boolean isJavascriptRegexpPrefix(String input) {
    return REGEXP_TOKEN_PREFIXS.contains(input);
  }

  /**
   * Encodes the specified character using Ascii for convenient insertion into
   * a single-quote enclosed {@code String}. Printable characters
   * are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
   * back-slash and single quote are all backslash-escaped. All other characters
   * are returned hex-encoded.
   *
   * @param chr {@code char} to encode
   * @return an Ascii-friendly encoding of the given {@code char}
   */
  public static String encodeCharForAscii(char chr) {
    if (chr == '\'') {
      return "\\'";
    } else if (chr == '\\') {
      return "\\\\";
    } else if (chr >= 32 && chr <= 126) {
      return String.format("%c", chr);
    } else if (chr == '\n') {
      return "\\n";
    } else if (chr == '\r') {
      return "\\r";
    } else if (chr == '\t') {
      return "\\t";
    } else {
      // Cannot apply a precision specifier for integral types. Specifying
      // 0-padded hex-encoding with minimum width of two.
      return String.format("\\u%04x", (int)chr);
    }
  }

  /**
   * Parses the given {@code String} to determine if it contains a URL in the
   * format followed by the {@code content} attribute of the {@code meta}
   * HTML tag.
   *
   * <p>This function expects to receive the value of the {@code content} HTML
   * attribute. This attribute takes on different meanings depending on the
   * value of the {@code http-equiv} HTML attribute of the same {@code meta}
   * tag. Since we may not have access to the {@code http-equiv} attribute,
   * we instead rely on parsing the given value to determine if it contains
   * a URL.
   *
   * The specification of the {@code meta} HTML tag can be found in:
   *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
   *
   * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
   * value contains a URL and whether we are at the start of the URL or past
   * the start. We are at the start of the URL if and only if one of the two
   * conditions below is true:
   * <ul>
   * <li>The given input does not contain any characters from the URL proper.
   * Example "5; URL=".
   * <li>The given input only contains the optional leading single or double
   * quote leading the URL. Example "5; URL='".
   * </li>
   * </ul>
   *
   * <p>Examples:
   * <ul>
   * <li> Example of a complete {@code meta} tag where the {@code content}
   * attribute contains a URL [we are not at the start of the URL]:
   * <pre>
   * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
   * </pre>
   * <li> Example of a complete {@code meta} tag where the {@code content}
   * attribute contains a URL [we are at the start of the URL]:
   * <pre>
   * &lt;meta http-equiv="refresh" content="5; URL="&gt;
   * </pre>
   * <li>Example of a complete {@code meta} tag where the {@code content}
   * attribute does not contain a URL:
   * <pre>
   * &lt;meta http-equiv="content-type" content="text/html"&gt;
   * </pre>
   * </ul>
   *
   * @param value {@code String} to parse
   * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
   * of a URL in the given value
   */
  public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
    if (value == null)
      return META_REDIRECT_TYPE.NONE;

    Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
    if (!matcher.find())
      return META_REDIRECT_TYPE.NONE;

    // We have more content.
    if (value.length() > matcher.end())
      return META_REDIRECT_TYPE.URL;

    return META_REDIRECT_TYPE.URL_START;
  }
}