Home | History | Annotate | Download | only in streamhtmlparser
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser;
     18 
     19 /**
     20  * Methods exposed for HTML parsing of text to facilitate implementation
     21  * of Automatic context-aware escaping. The HTML parser also embeds a
     22  * Javascript parser for processing Javascript fragments. In the future,
     23  * it will also embed other specific parsers and hence most likely remain
     24  * the main interface to callers of this package.
     25  *
     26  * <p>Note: These are the exact methods exposed in the original C++ Parser. The
     27  * names are simply modified to conform to Java.
     28  */
     29 public interface HtmlParser extends Parser {
     30 
     31   /**
     32    * The Parser Mode requested for parsing a given template.
     33    * Currently we support:
     34    * <ul>
     35    * <li>{@code HTML} for HTML templates.
     36    * <li>{@code JS} for javascript templates.
     37    * <li>{@code CSS} for Cascading Style-Sheets templates.
     38    * <li>{@code HTML_IN_TAG} for HTML templates that consist only of
     39    *     HTML attribute name and value pairs. This is typically the case for
     40    *     a template that is being included from a parent template where the
     41    *     parent template contains the start and the closing of the HTML tag.
     42    *     This is a special mode, for standard HTML templates please use
     43    *     {@link #HTML}.
     44    *     An example of such as template is:
     45    *     <p><code>class="someClass" target="_blank"</code></p>
     46    *     <p>Which could be included from a parent template that contains
     47    *     an anchor tag, say:</p>
     48    *     <p><code>&lt;a href="/bla" ["INCLUDED_TEMPLATE"]&gt;</code></p>
     49    * </ul>
     50    */
     51   public enum Mode {
     52     HTML,
     53     JS,
     54     CSS,
     55     HTML_IN_TAG
     56   }
     57 
     58   /**
     59    * Indicates the type of HTML attribute that the parser is currently in or
     60    * {@code NONE} if the parser is not currently in an attribute.
     61    * {@code URI} is for attributes taking a URI such as "href" and "src".
     62    * {@code JS} is for attributes taking javascript such as "onclick".
     63    * {@code STYLE} is for the "style" attribute.
     64    * All other attributes fall under {@code REGULAR}.
     65    *
     66    * Returned by {@link HtmlParser#getAttributeType()}
     67    */
     68   public enum ATTR_TYPE {
     69     NONE,
     70     REGULAR,
     71     URI,
     72     JS,
     73     STYLE
     74   }
     75 
     76   /**
     77    * All the states in which the parser can be. These are external states.
     78    * The parser has many more internal states that are not exposed and which
     79    * are instead mapped to one of these external ones.
     80    * {@code STATE_TEXT} the parser is in HTML proper.
     81    * {@code STATE_TAG} the parser is inside an HTML tag name.
     82    * {@code STATE_COMMENT} the parser is inside an HTML comment.
     83    * {@code STATE_ATTR} the parser is inside an HTML attribute name.
     84    * {@code STATE_VALUE} the parser is inside an HTML attribute value.
     85    * {@code STATE_JS_FILE} the parser is inside javascript code.
     86    * {@code STATE_CSS_FILE} the parser is inside CSS code.
     87    *
     88    * <p>All these states map exactly to those exposed in the C++ (original)
     89    * version of the HtmlParser.
     90    */
     91   public final static ExternalState STATE_TEXT =
     92       new ExternalState("STATE_TEXT");
     93   public final static ExternalState STATE_TAG =
     94       new ExternalState("STATE_TAG");
     95   public final static ExternalState STATE_COMMENT =
     96       new ExternalState("STATE_COMMENT");
     97   public final static ExternalState STATE_ATTR =
     98       new ExternalState("STATE_ATTR");
     99   public final static ExternalState STATE_VALUE =
    100       new ExternalState("STATE_VALUE");
    101   public final static ExternalState STATE_JS_FILE =
    102       new ExternalState("STATE_JS_FILE");
    103   public final static ExternalState STATE_CSS_FILE =
    104       new ExternalState("STATE_CSS_FILE");
    105 
    106   /**
    107    * Returns {@code true} if the parser is currently processing Javascript.
    108    * Such is the case if and only if, the parser is processing an attribute
    109    * that takes Javascript, a Javascript script block or the parser
    110    * is (re)set with {@link Mode#JS}.
    111    *
    112    * @return {@code true} if the parser is processing Javascript,
    113    *         {@code false} otherwise
    114    */
    115   public boolean inJavascript();
    116 
    117   /**
    118    * Returns {@code true} if the parser is currently processing
    119    * a Javascript litteral that is quoted. The caller will typically
    120    * invoke this method after determining that the parser is processing
    121    * Javascript. Knowing whether the element is quoted or not helps
    122    * determine which escaping to apply to it when needed.
    123    *
    124    * @return {@code true} if and only if the parser is inside a quoted
    125    *         Javascript literal
    126    */
    127   public boolean isJavascriptQuoted();
    128 
    129 
    130   /**
    131    * Returns {@code true} if and only if the parser is currently within
    132    * an attribute, be it within the attribute name or the attribute value.
    133    *
    134    * @return {@code true} if and only if inside an attribute
    135    */
    136   public boolean inAttribute();
    137 
    138   /**
    139    * Returns {@code true} if and only if the parser is currently within
    140    * a CSS context. A CSS context is one of the below:
    141    * <ul>
    142    * <li>Inside a STYLE tag.
    143    * <li>Inside a STYLE attribute.
    144    * <li>Inside a CSS file when the parser was reset in the CSS mode.
    145    * </ul>
    146    *
    147    * @return {@code true} if and only if the parser is inside CSS
    148    */
    149   public boolean inCss();
    150 
    151   /**
    152    * Returns the type of the attribute that the parser is in
    153    * or {@code ATTR_TYPE.NONE} if we are not parsing an attribute.
    154    * The caller will typically invoke this method after determining
    155    * that the parser is processing an attribute.
    156    *
    157    * <p>This is useful to determine which escaping to apply based
    158    * on the type of value this attribute expects.
    159    *
    160    * @return type of the attribute
    161    * @see HtmlParser.ATTR_TYPE
    162    */
    163   public ATTR_TYPE getAttributeType();
    164 
    165   /**
    166    * Returns {@code true} if and only if the parser is currently within
    167    * an attribute value and that attribute value is quoted.
    168    *
    169    * @return {@code true} if and only if the attribute value is quoted
    170    */
    171   public boolean isAttributeQuoted();
    172 
    173 
    174   /**
    175    * Returns the name of the HTML tag if the parser is currently within one.
    176    * Note that the name may be incomplete if the parser is currently still
    177    * parsing the name. Returns an empty {@code String} if the parser is not
    178    * in a tag as determined by {@code getCurrentExternalState}.
    179    *
    180    * @return the name of the HTML tag or an empty {@code String} if we are
    181    *         not within an HTML tag
    182    */
    183   public String getTag();
    184 
    185   /**
    186    * Returns the name of the HTML attribute the parser is currently processing.
    187    * If the parser is still parsing the name, then the returned name
    188    * may be incomplete. Returns an empty {@code String} if the parser is not
    189    * in an attribute as determined by {@code getCurrentExternalState}.
    190    *
    191    * @return the name of the HTML attribute or an empty {@code String}
    192    *         if we are not within an HTML attribute
    193    */
    194   public String getAttribute();
    195 
    196   /**
    197    * Returns the value of an HTML attribute if the parser is currently
    198    * within one. If the parser is currently parsing the value, the returned
    199    * value may be incomplete. The caller will typically first determine
    200    * that the parser is processing a value by calling
    201    * {@code getCurrentExternalState}.
    202    *
    203    * @return the value, could be an empty {@code String} if the parser is not
    204    *         in an HTML attribute value
    205    */
    206   public String getValue();
    207 
    208   /**
    209    * Returns the current position of the parser within the HTML attribute
    210    * value, zero being the position of the first character in the value.
    211    * The caller will typically first determine that the parser is
    212    * processing a value by calling {@link #getState()}.
    213    *
    214    * @return the index or zero if the parser is not processing a value
    215    */
    216   public int getValueIndex();
    217 
    218   /**
    219    * Returns {@code true} if and only if the current position of the parser is
    220    * at the start of a URL HTML attribute value. This is the case when the
    221    * following three conditions are all met:
    222    * <p>
    223    * <ol>
    224    * <li>The parser is in an HTML attribute value.
    225    * <li>The HTML attribute expects a URL, as determined by
    226    *     {@link #getAttributeType()} returning {@code .ATTR_TYPE#URI}.
    227    * <li>The parser has not yet seen any characters from that URL.
    228    * </ol>
    229    *
    230    * <p> This method may be used by an Html Sanitizer or an Auto-Escape system
    231    * to determine whether to validate the URL for well-formedness and validate
    232    * the scheme of the URL (e.g. {@code HTTP}, {@code HTTPS}) is safe.
    233    * In particular, it is recommended to use this method instead of
    234    * checking that {@link #getValueIndex()} is {@code 0} to support attribute
    235    * types where the URL does not start at index zero, such as the
    236    * {@code content} attribute of the {@code meta} HTML tag.
    237    *
    238    * @return {@code true} if and only if the parser is at the start of the URL
    239    */
    240   public boolean isUrlStart();
    241 
    242   /**
    243    * Resets the state of the parser, allowing for reuse of the
    244    * {@code HtmlParser} object.
    245    *
    246    * <p>See the {@link HtmlParser.Mode} enum for information on all
    247    * the valid modes.
    248    *
    249    * @param mode is an enum representing the high-level state of the parser
    250    */
    251   public void resetMode(HtmlParser.Mode mode);
    252 
    253   /**
    254    * A specialized directive to tell the parser there is some content
    255    * that will be inserted here but that it will not get to parse. Used
    256    * by the template system that may not be able to give some content
    257    * to the parser but wants it to know there typically will be content
    258    * inserted at that point. This is a hint used in corner cases within
    259    * parsing of HTML attribute names and values where content we do not
    260    * get to see could affect our parsing and alter our current state.
    261    *
    262    * <p>Returns {@code false} if and only if the parser encountered
    263    * a fatal error which prevents it from continuing further parsing.
    264    *
    265    * <p>Note: The return value is different from the C++ Parser which
    266    * always returns {@code true} but in my opinion makes more sense.
    267    *
    268    * @throws ParseException if an unrecoverable error occurred during parsing
    269    */
    270   public void insertText() throws ParseException;
    271 
    272   /**
    273    * Returns the state the Javascript parser is in.
    274    *
    275    * <p>See {@link JavascriptParser} for more information on the valid
    276    * external states. The caller will typically first determine that the
    277    * parser is processing Javascript and then invoke this method to
    278    * obtain more fine-grained state information.
    279    *
    280    * @return external state of the javascript parser
    281    */
    282   public ExternalState getJavascriptState();
    283 }
    284