Home | History | Annotate | Download | only in streamhtmlparser
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser;
     18 
import com.google.streamhtmlparser.impl.HtmlParserImpl;

import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
     23 
     24 /**
     25  * A factory class to obtain instances of an {@link HtmlParser}.
     26  * Currently each instance is a new object given these are fairly
     27  * light-weight.
     28  *
     29  * <p>In the unlikely case that this class fails to initialize properly
     30  * (a developer error), an error is emitted to the error console and the logs
     31  * and the specialized parser creation methods will throw
     32  * an {@link AssertionError} on all invokations.
     33  */
     34 public class HtmlParserFactory {
     35 
     36   private static final Logger logger =
     37       Logger.getLogger(HtmlParserFactory.class.getName());
     38 
     39   /**
     40    * To provide additional options when creating an {@code HtmlParser} using
     41    * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE,
     42    *        boolean, Set)}
     43    */
     44   public enum AttributeOptions {
     45 
     46     /**
     47      * Indicates that the attribute value is Javascript-quoted. Only takes
     48      * effect for Javascript-accepting attributes - as identified by
     49      * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also
     50      * HTML quoted.
     51      */
     52     JS_QUOTED,
     53 
     54     /**
     55      * Indicates the attribute value is only a part of a URL as opposed to a
     56      * full URL. In particular, the value is not at the start of a URL and
     57      * hence does not necessitate validation of the URL scheme.
     58      * Only valid for URI-accepting attributes - as identified by
     59      * {@link HtmlParser.ATTR_TYPE#URI}.
     60      */
     61     URL_PARTIAL,
     62   }
     63 
     64   /**
     65    * To provide additional options when creating an {@code HtmlParser} using
     66    * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)}
     67    */
     68   public enum ModeOptions {
     69 
     70     /**
     71      * Indicates that the parser is inside a quoted {@code String}. Only
     72      * valid in the {@link HtmlParser.Mode#JS} mode.
     73      */
     74     JS_QUOTED
     75   }
     76 
     77   private static final HtmlParser parserInDefaultAttr = createParser();
     78   private static final HtmlParser parserInDefaultAttrQ = createParser();
     79   private static final HtmlParser parserInUriAttrComplete = createParser();
     80   private static final HtmlParser parserInUriAttrQComplete = createParser();
     81   private static final HtmlParser parserInUriAttrPartial = createParser();
     82   private static final HtmlParser parserInUriAttrQPartial = createParser();
     83   private static final HtmlParser parserInJsAttr = createParser();
     84   private static final HtmlParser parserInJsAttrQ = createParser();
     85   private static final HtmlParser parserInQJsAttr = createParser();
     86   private static final HtmlParser parserInStyleAttr = createParser();
     87   private static final HtmlParser parserInStyleAttrQ = createParser();
     88   private static final HtmlParser parserInJsQ = createParser();
     89 
     90   /**
     91    * Protects all the createParserXXX methods by throwing a run-time exception
     92    * if this class failed to initialize properly.
     93    */
     94   private static boolean initSuccess = false;
     95 
     96   static {
     97     try {
     98       initializeParsers();
     99       initSuccess = true;
    100     } catch (ParseException e) {
    101       // Log a severe error and print it to stderr along with a stack trace.
    102       String error = HtmlParserFactory.class.getName() +
    103                      " Failed initialization: " + e.getMessage();
    104       logger.severe(error);
    105       System.err.println(error);
    106       e.printStackTrace();
    107     }
    108   }
    109 
    110   // Static class.
    111   private HtmlParserFactory() {
    112   }  // COV_NF_LINE
    113 
    114   /**
    115    * Returns an {@code HtmlParser} object ready to parse HTML input.
    116    *
    117    * @return an {@code HtmlParser} in the provided mode
    118    */
    119   public static HtmlParser createParser() {
    120     return new HtmlParserImpl();
    121   }
    122 
    123   /**
    124    * Returns an {@code HtmlParser} object initialized with the
    125    * requested Mode. Provide non {@code null} options to provide
    126    * a more precise initialization with the desired Mode.
    127    *
    128    * @param mode the mode to reset the parser with
    129    * @param options additional options or {@code null} for none
    130    * @return an {@code HtmlParser} in the provided mode
    131    * @throws AssertionError when this class failed to initialize
    132    */
    133   public static HtmlParser createParserInMode(HtmlParser.Mode mode,
    134                                               Set<ModeOptions> options) {
    135     requireInitialized();
    136 
    137     if (options != null && options.contains(ModeOptions.JS_QUOTED))
    138       return createParser(parserInJsQ);
    139 
    140     // With no options given, this method is just a convenience wrapper for
    141     // the two calls below.
    142     HtmlParser parser = new HtmlParserImpl();
    143     parser.resetMode(mode);
    144     return parser;
    145   }
    146 
    147   /**
    148    * Returns an {@code HtmlParser} that is a copy of the one
    149    * supplied. It holds the same internal state and hence can
    150    * proceed with parsing in-lieu of the supplied parser.
    151    *
    152    * @param aHtmlParser a {@code HtmlParser} to copy from
    153    * @return an {@code HtmlParser} that is a copy of the provided one
    154    * @throws AssertionError when this class failed to initialize
    155    */
    156   public static HtmlParser createParser(HtmlParser aHtmlParser) {
    157     requireInitialized();
    158 
    159     // Should never get a ClassCastException since there is only one
    160     // implementation of the HtmlParser interface.
    161     return new HtmlParserImpl((HtmlParserImpl) aHtmlParser);
    162   }
    163 
    164   /**
    165    * A very specialized {@code HtmlParser} accessor that returns a parser
    166    * in a state where it expects to read the value of an attribute
    167    * of an HTML tag. This is only useful when the parser has not seen a
    168    * certain HTML tag and an attribute name and needs to continue parsing
    169    * from a state as though it has.
    170    *
    171    * <p>For example, to create a parser in a state akin to that
    172    * after the parser has parsed "&lt;a href=\"", invoke:
    173    * <pre>
    174    *   createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)}
    175    * </pre>
    176    *
    177    * <p>You must provide the proper value of quoting or the parser
    178    * will go into an unexpected state.
    179    * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE}
    180    * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state
    181    * inside an HTML tag where it expects an attribute name not an attribute
    182    * value. It becomes equivalent to a parser initialized in the
    183    * {@code HTML_IN_TAG} mode.
    184    *
    185    * @param attrtype the attribute type which the parser should be in
    186    * @param quoted whether the attribute value is enclosed in double quotes
    187    * @param options additional options or {@code null} for none
    188    * @return an {@code HtmlParser} initialized in the given attribute type
    189    *         and quoting
    190    * @throws AssertionError when this class failed to initialize
    191    */
    192   public static HtmlParser createParserInAttribute(
    193       HtmlParser.ATTR_TYPE attrtype,
    194       boolean quoted, Set<AttributeOptions> options) {
    195     requireInitialized();
    196 
    197     HtmlParser parser;
    198     switch (attrtype) {
    199       case REGULAR:
    200         parser = createParser(
    201             quoted ? parserInDefaultAttrQ : parserInDefaultAttr);
    202         break;
    203       case URI:
    204         if (options != null && options.contains(AttributeOptions.URL_PARTIAL))
    205           parser = createParser(
    206               quoted ? parserInUriAttrQPartial : parserInUriAttrPartial);
    207         else
    208           parser = createParser(
    209               quoted ? parserInUriAttrQComplete : parserInUriAttrComplete);
    210         break;
    211       case JS:
    212         // Note: We currently do not support the case of the value being
    213         // inside a Javascript quoted string that is in an unquoted HTML
    214         // attribute, such as <a href=bla onmouseover=alert('[VALUE')>.
    215         // It would be simple to add but currently we assume Javascript
    216         // quoted attribute values are always HTML quoted.
    217         if (quoted) {
    218           if (options != null && options.contains(AttributeOptions.JS_QUOTED))
    219             parser = createParser(parserInQJsAttr);
    220           else
    221             parser = createParser(parserInJsAttrQ);
    222         } else {
    223           parser = createParser(parserInJsAttr);
    224         }
    225         break;
    226       case STYLE:
    227         parser = createParser(
    228             quoted ? parserInStyleAttrQ : parserInStyleAttr);
    229         break;
    230       case NONE:
    231         parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null);
    232         break;
    233       default:
    234         throw new IllegalArgumentException(
    235             "Did not recognize ATTR_TYPE given: " + attrtype);
    236     }
    237     return parser;
    238   }
    239 
    240   /**
    241    * Initializes a set of static parsers to be subsequently used
    242    * by the various createParserXXX methods.
    243    * The parsers are set to their proper states by making them parse
    244    * an appropriate HTML input fragment. This approach is the most likely
    245    * to ensure all their internal state is consistent.
    246    *
    247    * <p>In the very unexpected case of the parsing failing (developer error),
    248    * this class will fail to initialize properly.
    249    *
    250    * <p>In addition:
    251    * <ul>
    252    * <li>The HTML tag is set to a fictitious name {@code xparsertag}.
    253    * <li>The attribute name is chosen to match the required attribute type.
    254    *     When several possibilities exist, one is chosen arbitrarily.
    255    * <li>If quoting is required, a double quote is provided after the '='.
    256    * </ul>
    257    *
    258    * @throws ParseException if parsing failed.
    259    */
    260   private static void initializeParsers() throws ParseException {
    261     parserInDefaultAttr.parse("<xparsertag htmlparser=");
    262     parserInDefaultAttrQ.parse("<xparsertag htmlparser=\"");
    263 
    264     // Chosing the "src" attribute, one of several possible names here
    265     parserInUriAttrComplete.parse("<xparsertag src=");
    266     parserInUriAttrQComplete.parse("<xparsertag src=\"");
    267 
    268     // To support a parser that is initialized within a URL parameter
    269     // rather than at the beginning of a URL. We use a fake domain
    270     // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>)
    271     // and a fake query parameter.
    272     final String fakeUrlPrefix = "http://example.com/fakequeryparam=";
    273     parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix);
    274     parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix);
    275 
    276     // Using onmouse= which is a fictitious attribute name that the parser
    277     // understands as being a valid javascript-enabled attribute. Chosing fake
    278     // names may help during debugging.
    279     parserInJsAttr.parse("<xparsertag onmouse=");
    280     parserInJsAttrQ.parse("<xparsertag onmouse=\"");
    281     // Single quote added as the Javascript is itself quoted.
    282     parserInQJsAttr.parse("<xparsertag onmouse=\"'");
    283 
    284     // A parser in the Javascript context within a (single) quoted string.
    285     parserInJsQ.resetMode(HtmlParser.Mode.JS);
    286     parserInJsQ.parse("var fakeparservar='");
    287 
    288     // Chosing the "style" attribute as it is the only option
    289     parserInStyleAttr.parse("<xparsertag style=");
    290     parserInStyleAttrQ.parse("<xparsertag style=\"");
    291   }
    292 
    293   /**
    294    * Throws an {@link AssertionError} if the class was not initialized
    295    * correctly, otherwise simply returns. This is to protect against the
    296    * possibility the needed parsers were not created successfully during
    297    * static initialized, which can only happen due to an error during
    298    * development of this library.
    299    *
    300    * @throws AssertionError when this class failed to initialize
    301    */
    302   private static void requireInitialized() {
    303     if (!initSuccess)
    304       throw new AssertionError("HtmlParserFactory failed initialization.");
    305   }
    306 }
    307