Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.util;
     18 
     19 import com.google.common.collect.ImmutableSortedSet;
     20 
     21 import java.util.Set;
     22 import java.util.regex.Pattern;
     23 import java.util.regex.Matcher;
     24 
     25 /**
     26  * Utility functions for HTML and Javascript that are most likely
     27  * not interesting to users outside this package.
     28  *
     29  * <p>The <code>HtmlParser</code> will be open-sourced hence we took the
     30  * decision to keep these utilities in this package as well as not to
     31  * leverage others that may exist in the <code>google3</code> code base.
     32  *
     33  * <p>The functionality exposed is designed to be 100% compatible with
     34  * the corresponding logic in the C-version of the HtmlParser as such
     35  * we are particularly concerned with cross-language compatibility.
     36  *
     37  * <p>Note: The words {@code Javascript} and {@code ECMAScript} are used
     38  * interchangeably unless otherwise noted.
     39  */
     40 public final class HtmlUtils {
     41 
     42   /**
     43    * static utility class
     44    */
     45   private HtmlUtils() {
     46   }  // COV_NF_LINE
     47 
     48   /**
     49    * Indicates the type of content contained in the {@code content} HTML
     50    * attribute of the {@code meta} HTML tag. Used by
     51    * {@link HtmlUtils#parseContentAttributeForUrl(String)}.
     52    * <p>The values are:
     53    * <ul>
     54    * <li>{@code NONE} if it does not contain a URL in the expected format.
     55    * <li>{@code URL_START} if it contains a URL but hasn't seen any of
     56    * its contents.
     57    * <li>{@code URL} if it contains a URL and has seen at least some of
     58    * its contents.
     59    * </ul>
     60    */
     61   public enum META_REDIRECT_TYPE {
     62     NONE,
     63     URL_START,
     64     URL
     65   }
     66 
     67   /**
     68    * A regular expression matching the format of a {@code content} attribute
     69    * that contains a URL. Used by {@link #parseContentAttributeForUrl}.
     70    */
     71   private static final String META_REDIRECT_REGEX =
     72       "^\\s*\\d*\\s*;\\s*URL\\s*=\\s*[\'\"]?";
     73 
     74   // Safe for use by concurrent threads so we compile once.
     75   private static final Pattern META_REDIRECT_PATTERN =
     76       Pattern.compile(META_REDIRECT_REGEX, Pattern.CASE_INSENSITIVE);
     77 
     78   /**
     79    * Set of keywords that can precede a regular expression literal. Taken from:
     80    * <a href="http://www.mozilla.org/js/language/js20-2000-07/rationale/syntax.html">
     81    * Language Syntax</a>
     82    *
     83    * <p>The token {@code void} was added to the list. Several keywords are
     84    * defined in Ecmascript 4 not Ecmascript 3. However, to keep the logic
     85    * simple we do not differentiate on the version and bundle them all together.
     86    */
     87   private static final Set<String> REGEXP_TOKEN_PREFIXS =
     88       ImmutableSortedSet.of(
     89           "abstract",
     90           "break",
     91           "case",
     92           "catch",
     93           "class",
     94           "const",
     95           "continue",
     96           "debugger",
     97           "default",
     98           "delete",
     99           "do",
    100           "else",
    101           "enum",
    102           "eval",
    103           "export",
    104           "extends",
    105           "field",
    106           "final",
    107           "finally",
    108           "for",
    109           "function",
    110           "goto",
    111           "if",
    112           "implements",
    113           "import",
    114           "in",
    115           "instanceof",
    116           "native",
    117           "new",
    118           "package",
    119           "private",
    120           "protected",
    121           "public",
    122           "return",
    123           "static",
    124           "switch",
    125           "synchronized",
    126           "throw",
    127           "throws",
    128           "transient",
    129           "try",
    130           "typeof",
    131           "var",
    132           "void",
    133           "volatile",
    134           "while",
    135           "with");
    136 
    137   /**
    138    * Set of all HTML attributes which expect a URI (as the value).
    139    * <a href="http://www.w3.org/TR/html4/index/attributes.html">Index of Attributes</a>
    140    */
    141   private static final Set<String> ATTRIBUTE_EXPECTS_URI =
    142       ImmutableSortedSet.of(
    143           "action",
    144           "archive",
    145           "background",
    146           "cite",
    147           "classid",
    148           "codebase",
    149           "data",
    150           "dynsrc",
    151           "href",
    152           "longdesc",
    153           "src",
    154           "usemap");
    155 
    156   /**
    157    * Set of {@code Character}s considered whitespace in Javascript.
    158    * See {@link #isJavascriptWhitespace(char)}
    159    */
    160   private static final Set<Character> JAVASCRIPT_WHITESPACE =
    161       ImmutableSortedSet.of(
    162             '\u0009',         /* Tab \t */
    163             '\n',             /* Line-Feed 0x0A */
    164             '\u000B',         /* Vertical Tab 0x0B */
    165             '\u000C',         /* Form Feed \f */
    166             '\r',             /* Carriage Return 0x0D */
    167             ' ',              /* Space 0x20 */
    168             '\u00A0',         /* Non-breaking space 0xA0 */
    169             '\u2028',         /* Line separator */
    170             '\u2029');        /* Paragraph separator */
    171 
    172   /**
    173   * Set of {@code Character}s considered whitespace in HTML.
    174   * See {@link #isHtmlSpace(char)}
    175   */
    176  private static final Set<Character> HTML_WHITESPACE =
    177       ImmutableSortedSet.of(
    178           ' ',
    179           '\t',
    180           '\n',
    181           '\r',
    182           '\u200B');
    183 
    184 
    185   /**
    186    * Determines if the HTML attribute specified expects javascript
    187    * for its value. Such is the case for example with the {@code onclick}
    188    * attribute.
    189    *
    190    * <p>Currently returns {@code true} for any attribute name that starts
    191    * with "on" which is not exactly correct but we trust a developer to
    192    * not use non-spec compliant attribute names (e.g. onbogus).
    193    *
    194    * @param attribute the name of an HTML attribute
    195    * @return {@code false} if the input is null or is not an attribute
    196    *         that expects javascript code; {@code true}
    197    */
    198   public static boolean isAttributeJavascript(String attribute) {
    199     return ((attribute != null) && attribute.startsWith("on"));
    200   }
    201 
    202   /**
    203    * Determines if the HTML attribute specified expects a {@code style}
    204    * for its value. Currently this is only true for the {@code style}
    205    * HTML attribute.
    206    *
    207    * @param attribute the name of an HTML attribute
    208    * @return {@code true} iff the attribute name is one that expects a
    209    *     style for a value; otherwise {@code false}
    210    */
    211   public static boolean isAttributeStyle(String attribute) {
    212     return "style".equals(attribute);
    213   }
    214 
    215   /**
    216    * Determines if the HTML attribute specified expects a {@code URI}
    217    * for its value. For example, both {@code href} and {@code src}
    218    * expect a {@code URI} but {@code style} does not. Returns
    219    * {@code false} if the attribute given was {@code null}.
    220    *
    221    * @param attribute the name of an HTML attribute
    222    * @return {@code true} if the attribute name is one that expects
    223    *         a URI for a value; otherwise {@code null}
    224    *
    225    * @see #ATTRIBUTE_EXPECTS_URI
    226    */
    227   public static boolean isAttributeUri(String attribute) {
    228     return ATTRIBUTE_EXPECTS_URI.contains(attribute);
    229   }
    230 
    231   /**
    232    * Determines if the specified character is an HTML whitespace character.
    233    * A character is an HTML whitespace character if and only if it is one
    234    * of the characters below.
    235    * <ul>
    236    * <li>A <code>Space</code> character
    237    * <li>A <code>Tab</code> character
    238    * <li>A <code>Line feed</code> character
    239    * <li>A <code>Carriage Return</code> character
    240    * <li>A <code>Zero-Width Space</code> character
    241    * </ul>
    242    *
    243    * Note: The list includes the zero-width space (<code>&amp;#x200B;</code>)
    244    * which is not included in the C version.
    245    *
    246    * @param chr the {@code char} to check
    247    * @return {@code true} if the character is an HTML whitespace character
    248    *
    249    * <a href="http://www.w3.org/TR/html401/struct/text.html#h-9.1">White space</a>
    250    */
    251   public static boolean isHtmlSpace(char chr) {
    252     return HTML_WHITESPACE.contains(chr);
    253   }
    254 
    255   /**
    256    * Determines if the specified character is an ECMAScript whitespace or line
    257    * terminator character. A character is a whitespace or line terminator if
    258    * and only if it is one of the characters below:
    259    * <ul>
    260    * <li>A white-space character (<code>Tab</code>, <code>Vertical Tab</code>,
    261    *     <code>Form Feed</code>, <code>Space</code>,
    262    *     <code>No-break space</code>)
    263    * <li>A line terminator character (<code>Line Feed</code>,
    264    *     <code>Carriage Return</code>, <code>Line separator</code>,
    265    *     <code>Paragraph Separator</code>).
    266    * </ul>
    267    *
    268    * <p>Encompasses the characters in sections 7.2 and 7.3 of ECMAScript 3, in
    269    * particular, this list is quite different from that in
    270    * <code>Character.isWhitespace</code>.
    271    * <a href="http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf">
    272    * ECMAScript Language Specification</a>
    273    *
    274    * @param chr the {@code char} to check
    275    * @return {@code true} or {@code false}
    276    *
    277    */
    278   public static boolean isJavascriptWhitespace(char chr) {
    279     return JAVASCRIPT_WHITESPACE.contains(chr);
    280   }
    281 
    282   /**
    283    * Determines if the specified character is a valid character in an
    284    * ECMAScript identifier. This determination is currently not exact,
    285    * in particular:
    286    * <ul>
    287    * <li>It does not accept Unicode letters, only ASCII ones.
    288    * <li>It does not distinguish between the first character of an identifier
    289    *     (which cannot contain numbers) and subsequent characters.
    290    * </li>
    291    * </ul>
    292    *
    293    * We are considering leveraging <code>Character.isJavaIdentifierStart</code>
    294    * and <code>Character.isJavaIdentifierPart</code> given that Java
    295    * and Javascript follow similar identifier naming rules but we lose
    296    * compatibility with the C-version.
    297    *
    298    * @param chr {@code char} to check
    299    * @return {@code true} if the {@code chr} is a Javascript whitespace
    300    *         character; otherwise {@code false}
    301    */
    302   public static boolean isJavascriptIdentifier(char chr) {
    303     return ((chr >= 'a' && chr <= 'z')
    304         || (chr >= 'A' && chr <= 'Z')
    305         || (chr >= '0' && chr <= '9')
    306         || chr == '_' || chr == '$');
    307   }
    308 
    309   /**
    310    * Determines if the input token provided is a valid token prefix to a
    311    * javascript regular expression.  The token argument is compared against
    312    * a {@code Set} of identifiers that can precede a regular expression in the
    313    * javascript grammar, and returns {@code true} if the provided
    314    * {@code String} is in that {@code Set}.
    315    *
    316    * @param input the {@code String} token to check
    317    * @return {@code true} iff the token is a valid prefix of a regexp
    318    */
    319   public static boolean isJavascriptRegexpPrefix(String input) {
    320     return REGEXP_TOKEN_PREFIXS.contains(input);
    321   }
    322 
    323   /**
    324    * Encodes the specified character using Ascii for convenient insertion into
    325    * a single-quote enclosed {@code String}. Printable characters
    326    * are returned as-is. Carriage Return, Line Feed, Horizontal Tab,
    327    * back-slash and single quote are all backslash-escaped. All other characters
    328    * are returned hex-encoded.
    329    *
    330    * @param chr {@code char} to encode
    331    * @return an Ascii-friendly encoding of the given {@code char}
    332    */
    333   public static String encodeCharForAscii(char chr) {
    334     if (chr == '\'') {
    335       return "\\'";
    336     } else if (chr == '\\') {
    337       return "\\\\";
    338     } else if (chr >= 32 && chr <= 126) {
    339       return String.format("%c", chr);
    340     } else if (chr == '\n') {
    341       return "\\n";
    342     } else if (chr == '\r') {
    343       return "\\r";
    344     } else if (chr == '\t') {
    345       return "\\t";
    346     } else {
    347       // Cannot apply a precision specifier for integral types. Specifying
    348       // 0-padded hex-encoding with minimum width of two.
    349       return String.format("\\u%04x", (int)chr);
    350     }
    351   }
    352 
    353   /**
    354    * Parses the given {@code String} to determine if it contains a URL in the
    355    * format followed by the {@code content} attribute of the {@code meta}
    356    * HTML tag.
    357    *
    358    * <p>This function expects to receive the value of the {@code content} HTML
    359    * attribute. This attribute takes on different meanings depending on the
    360    * value of the {@code http-equiv} HTML attribute of the same {@code meta}
    361    * tag. Since we may not have access to the {@code http-equiv} attribute,
    362    * we instead rely on parsing the given value to determine if it contains
    363    * a URL.
    364    *
    365    * The specification of the {@code meta} HTML tag can be found in:
    366    *   http://dev.w3.org/html5/spec/Overview.html#attr-meta-http-equiv-refresh
    367    *
    368    * <p>We return {@link HtmlUtils.META_REDIRECT_TYPE} indicating whether the
    369    * value contains a URL and whether we are at the start of the URL or past
    370    * the start. We are at the start of the URL if and only if one of the two
    371    * conditions below is true:
    372    * <ul>
    373    * <li>The given input does not contain any characters from the URL proper.
    374    * Example "5; URL=".
    375    * <li>The given input only contains the optional leading single or double
    376    * quote leading the URL. Example "5; URL='".
    377    * </li>
    378    * </ul>
    379    *
    380    * <p>Examples:
    381    * <ul>
    382    * <li> Example of a complete {@code meta} tag where the {@code content}
    383    * attribute contains a URL [we are not at the start of the URL]:
    384    * <pre>
    385    * &lt;meta http-equiv="refresh" content="5; URL=http://www.google.com"&gt;
    386    * </pre>
    387    * <li> Example of a complete {@code meta} tag where the {@code content}
    388    * attribute contains a URL [we are at the start of the URL]:
    389    * <pre>
    390    * &lt;meta http-equiv="refresh" content="5; URL="&gt;
    391    * </pre>
    392    * <li>Example of a complete {@code meta} tag where the {@code content}
    393    * attribute does not contain a URL:
    394    * <pre>
    395    * &lt;meta http-equiv="content-type" content="text/html"&gt;
    396    * </pre>
    397    * </ul>
    398    *
    399    * @param value {@code String} to parse
    400    * @return {@link HtmlUtils.META_REDIRECT_TYPE} indicating the presence
    401    * of a URL in the given value
    402    */
    403   public static META_REDIRECT_TYPE parseContentAttributeForUrl(String value) {
    404     if (value == null)
    405       return META_REDIRECT_TYPE.NONE;
    406 
    407     Matcher matcher = META_REDIRECT_PATTERN.matcher(value);
    408     if (!matcher.find())
    409       return META_REDIRECT_TYPE.NONE;
    410 
    411     // We have more content.
    412     if (value.length() > matcher.end())
    413       return META_REDIRECT_TYPE.URL;
    414 
    415     return META_REDIRECT_TYPE.URL_START;
    416   }
    417 }
    418