Home | History | Annotate | Download | only in impl
      1 /*
      2  * Copyright (C) 2010 Google Inc.
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.google.streamhtmlparser.impl;
     18 
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;
     29 
     30 /**
     31  * A custom specialized parser - ported from the main C++ version - used to
     32  * implement context-aware escaping of run-time data in web-application
     33  * templates.
     34  *
     35  * <p>This is the main class in the package. It implements the
     36  * {@code HtmlParser} interface.
     37  *
 * <p>This class is not thread-safe; in particular, you cannot invoke any
 * state-changing operations (such as {@code parse}) from multiple threads
 * on the same object.
     41  *
     42  * <p>If you are looking at this class, chances are very high you are
     43  * implementing Auto-Escaping for a new template system. Please see the
     44  * landing page including a design document at
     45  * <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
     46  */
     47 public class HtmlParserImpl extends GenericParser implements HtmlParser {
     48 
     49   /*
     50    * Internal representation of the parser state, which is at a
     51    * finer-granularity than the external state as given to callers.
     52    * The relationship between <code>InternalState</code> and
     53    * <code>ExternalState</code> is a many-to-one relationship.
     54    */
     55   private static final InternalState TEXT;
     56   private static final InternalState TAG_START;
     57   private static final InternalState TAG_NAME;
     58   private static final InternalState DECL_START;
     59   private static final InternalState DECL_BODY;
     60   private static final InternalState COM_OPEN;
     61   private static final InternalState COM_BODY;
     62   private static final InternalState COM_DASH;
     63   private static final InternalState COM_DASH_DASH;
     64   private static final InternalState PI;
     65   private static final InternalState PI_MAY_END;
     66   private static final InternalState TAG_SPACE;
     67   private static final InternalState TAG_CLOSE;
     68   private static final InternalState ATTR;
     69   private static final InternalState ATTR_SPACE;
     70   private static final InternalState VALUE;
     71   private static final InternalState VALUE_TEXT;
     72   private static final InternalState VALUE_Q_START;
     73   private static final InternalState VALUE_Q;
     74   private static final InternalState VALUE_DQ_START;
     75   private static final InternalState VALUE_DQ;
     76   private static final InternalState CDATA_COM_START;
     77   private static final InternalState CDATA_COM_START_DASH;
     78   private static final InternalState CDATA_COM_BODY;
     79   private static final InternalState CDATA_COM_DASH;
     80   private static final InternalState CDATA_COM_DASH_DASH;
     81   private static final InternalState CDATA_TEXT;
     82   private static final InternalState CDATA_LT;
     83   private static final InternalState CDATA_MAY_CLOSE;
     84   private static final InternalState JS_FILE;
     85   private static final InternalState CSS_FILE;
     86 
     87   static {
     88     TEXT = InternalState.getInstanceHtml("TEXT");
     89     TAG_START = InternalState.getInstanceHtml("TAG_START");
     90     TAG_NAME = InternalState.getInstanceHtml("TAG_NAME");
     91     DECL_START = InternalState.getInstanceHtml("DECL_START");
     92     DECL_BODY = InternalState.getInstanceHtml("DECL_BODY");
     93     COM_OPEN = InternalState.getInstanceHtml("COM_OPEN");
     94     COM_BODY = InternalState.getInstanceHtml("COM_BODY");
     95     COM_DASH = InternalState.getInstanceHtml("COM_DASH");
     96     COM_DASH_DASH = InternalState.getInstanceHtml("COM_DASH_DASH");
     97     PI =InternalState.getInstanceHtml("PI");
     98     PI_MAY_END = InternalState.getInstanceHtml("PI_MAY_END");
     99     TAG_SPACE = InternalState.getInstanceHtml("TAG_SPACE");
    100     TAG_CLOSE = InternalState.getInstanceHtml("TAG_CLOSE");
    101     ATTR = InternalState.getInstanceHtml("ATTR");
    102     ATTR_SPACE = InternalState.getInstanceHtml("ATTR_SPACE");
    103     VALUE = InternalState.getInstanceHtml("VALUE");
    104     VALUE_TEXT = InternalState.getInstanceHtml("VALUE_TEXT");
    105     VALUE_Q_START = InternalState.getInstanceHtml("VALUE_Q_START");
    106     VALUE_Q = InternalState.getInstanceHtml("VALUE_Q");
    107     VALUE_DQ_START = InternalState.getInstanceHtml("VALUE_DQ_START");
    108     VALUE_DQ = InternalState.getInstanceHtml("VALUE_DQ");
    109     CDATA_COM_START = InternalState.getInstanceHtml("CDATA_COM_START");
    110     CDATA_COM_START_DASH =
    111         InternalState.getInstanceHtml("CDATA_COM_START_DASH");
    112     CDATA_COM_BODY = InternalState.getInstanceHtml("CDATA_COM_BODY");
    113     CDATA_COM_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH");
    114     CDATA_COM_DASH_DASH = InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
    115     CDATA_TEXT = InternalState.getInstanceHtml("CDATA_TEXT");
    116     CDATA_LT = InternalState.getInstanceHtml("CDATA_LT");
    117     CDATA_MAY_CLOSE = InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
    118     JS_FILE = InternalState.getInstanceHtml("JS_FILE");
    119     CSS_FILE = InternalState.getInstanceHtml("CSS_FILE");
    120   }
    121 
    122   private static final Map<InternalState, ExternalState> STATE_MAPPING =
    123       Maps.newHashMap();
    124   static {
    125     initializeStateMapping();
    126   }
    127 
    128   private static final ParserStateTable STATE_TABLE = new ParserStateTable();
    129   static {
    130     initializeParserStateTable();
    131   }
    132 
    133   private final CharacterRecorder tag;
    134   private final CharacterRecorder attr;
    135   private final CharacterRecorder value;
    136   private final CharacterRecorder cdataCloseTag;
    137   private final EntityResolver entityResolver;
    138   private final JavascriptParserImpl jsParser;
    139   private boolean insideJavascript;
    140   private int valueIndex;
    141   // True iff InsertText() was called at the start of a URL attribute value.
    142   private boolean textInsideUrlValue;
    143 
    144   /**
    145    * Creates an {@code HtmlParserImpl} object.
    146    *
    147    * <p>Both for performance reasons and to leverage code a state-flow machine
    148    * that is automatically generated from Python for multiple target
    149    * languages, this object uses a static {@code ParserStateTable} that
    150    * is read-only and obtained from the generated code in {@code HtmlParserFsm}.
    151    * That code also maintains the mapping from internal states
    152    * ({@code InternalState}) to external states ({@code ExternalState}).
    153    */
    154   public HtmlParserImpl() {
    155     super(STATE_TABLE, STATE_MAPPING, TEXT);
    156     tag = new CharacterRecorder();
    157     attr = new CharacterRecorder();
    158     value = new CharacterRecorder();
    159     cdataCloseTag = new CharacterRecorder();
    160     entityResolver = new EntityResolver();
    161     jsParser = new JavascriptParserImpl();
    162     insideJavascript = false;
    163     valueIndex = 0;
    164     textInsideUrlValue = false;
    165   }
    166 
    167   /**
    168    * Creates an {@code HtmlParserImpl} that is a copy of the one provided.
    169    *
    170    * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
    171    */
    172   public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
    173     super(aHtmlParserImpl);
    174     tag = new CharacterRecorder(aHtmlParserImpl.tag);
    175     attr = new CharacterRecorder(aHtmlParserImpl.attr);
    176     value = new CharacterRecorder(aHtmlParserImpl.value);
    177     cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
    178     entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
    179     jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
    180     insideJavascript = aHtmlParserImpl.insideJavascript;
    181     valueIndex = aHtmlParserImpl.valueIndex;
    182     textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
    183   }
    184 
    185   @Override
    186   public boolean inJavascript() {
    187     return (insideJavascript
    188             && ( (getState() == STATE_VALUE)
    189                  || (currentState == CDATA_TEXT)
    190                  || (currentState == CDATA_COM_START)
    191                  || (currentState == CDATA_COM_START_DASH)
    192                  || (currentState == CDATA_COM_BODY)
    193                  || (currentState == CDATA_COM_DASH)
    194                  || (currentState == CDATA_COM_DASH_DASH)
    195                  || (currentState == CDATA_LT)
    196                  || (currentState == CDATA_MAY_CLOSE)
    197                  || (currentState == JS_FILE) ));
    198   }
    199 
    200   @Override
    201   public boolean isJavascriptQuoted() {
    202     if (inJavascript()) {
    203       ExternalState jsParserState = jsParser.getState();
    204       return (jsParserState == JavascriptParserImpl.STATE_Q
    205               || jsParserState == JavascriptParserImpl.STATE_DQ);
    206     }
    207     return false;
    208   }
    209 
    210   @Override
    211   public boolean inAttribute() {
    212     ExternalState extState = getState();
    213     return (extState != null && (extState == STATE_ATTR
    214                                  || extState == STATE_VALUE));
    215   }
    216 
    217   /**
    218    * Returns {@code true} if and only if the parser is currently within
    219    * a CSS context. A CSS context is one of the below:
    220    * <ul>
    221    * <li>Inside a STYLE tag.
    222    * <li>Inside a STYLE attribute.
    223    * <li>Inside a CSS file when the parser was reset in the CSS mode.
    224    * </ul>
    225    *
    226    * @return {@code true} if and only if the parser is inside CSS
    227    */
    228   @Override
    229   public boolean inCss() {
    230     return (currentState == CSS_FILE
    231             || (getState() == STATE_VALUE
    232                 && (getAttributeType() == ATTR_TYPE.STYLE))
    233             || ("style".equals(getTag())));
    234   }
    235 
    236   @Override
    237   public ATTR_TYPE getAttributeType() {
    238     String attribute = getAttribute();
    239     if (!inAttribute()) {
    240       return ATTR_TYPE.NONE;
    241     }
    242     if (HtmlUtils.isAttributeJavascript(attribute)) {
    243       return ATTR_TYPE.JS;
    244     }
    245     if (HtmlUtils.isAttributeUri(attribute)) {
    246       return ATTR_TYPE.URI;
    247     }
    248     if (HtmlUtils.isAttributeStyle(attribute)) {
    249       return ATTR_TYPE.STYLE;
    250     }
    251 
    252     // Special logic to handle the "content" attribute of the "meta" tag.
    253     if ("meta".equals(getTag()) && "content".equals(getAttribute())) {
    254       HtmlUtils.META_REDIRECT_TYPE redirectType =
    255           HtmlUtils.parseContentAttributeForUrl(getValue());
    256       if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START ||
    257           redirectType == HtmlUtils.META_REDIRECT_TYPE.URL)
    258         return ATTR_TYPE.URI;
    259     }
    260 
    261     return ATTR_TYPE.REGULAR;
    262   }
    263 
    264   @Override
    265   public ExternalState getJavascriptState() {
    266     return jsParser.getState();
    267   }
    268 
    269   @Override
    270   public boolean isAttributeQuoted() {
    271     return (currentState == VALUE_Q_START
    272             || currentState == VALUE_Q
    273             || currentState == VALUE_DQ_START
    274             || currentState == VALUE_DQ);
    275   }
    276 
    277   @Override
    278   public String getTag() {
    279     return tag.getContent().toLowerCase();
    280   }
    281 
    282   @Override
    283   public String getAttribute() {
    284     return inAttribute() ? attr.getContent().toLowerCase() : "";
    285   }
    286 
    287   @Override
    288   public String getValue() {
    289     return (getState() == STATE_VALUE) ? value.getContent() : "";
    290   }
    291 
    292   @Override
    293   public int getValueIndex() {
    294     if (getState() != STATE_VALUE) {
    295       return 0;
    296     }
    297     return valueIndex;
    298   }
    299 
    300   @Override
    301   public boolean isUrlStart() {
    302     // False when not inside an HTML attribute value
    303     if (getState() != STATE_VALUE) {
    304       return false;
    305     }
    306 
    307     //  Or when the HTML attribute is not of URI type.
    308     if (getAttributeType() != ATTR_TYPE.URI) {
    309       return false;
    310     }
    311 
    312     // Or when we received an InsertText() directive at the start of a URL.
    313     if (textInsideUrlValue) {
    314       return false;
    315     }
    316 
    317     if ("meta".equals(getTag())) {
    318       // At this point, we know we are in the "content" attribute
    319       // or we would not have the URI attribute type.
    320       return (HtmlUtils.parseContentAttributeForUrl(getValue()) ==
    321               HtmlUtils.META_REDIRECT_TYPE.URL_START);
    322     }
    323 
    324     // For all other URI attributes, check if we are at index 0.
    325     return (getValueIndex() == 0);
    326 }
    327 
    328   /**
    329    * {@inheritDoc}
    330    *
    331    * Resets the state of the parser to a state consistent with the
    332    * {@code Mode} provided. This will reset finer-grained state
    333    * information back to a default value, hence use only when
    334    * you want to parse text from a very clean slate.
    335    *
    336    * <p>See the {@link HtmlParser.Mode} enum for information on all
    337    * the valid modes.
    338    *
    339    * @param mode is an enum representing the high-level state of the parser
    340    */
    341   @Override
    342   public void resetMode(Mode mode) {
    343     insideJavascript = false;
    344     tag.reset();
    345     attr.reset();
    346     value.reset();
    347     cdataCloseTag.reset();
    348     valueIndex = 0;
    349     textInsideUrlValue = false;
    350     jsParser.reset();
    351 
    352     switch (mode) {
    353       case HTML:
    354         currentState = TEXT;
    355         break;
    356       case JS:
    357         currentState = JS_FILE;
    358         insideJavascript = true;
    359         break;
    360       case CSS:
    361         currentState = CSS_FILE;
    362         break;
    363       case HTML_IN_TAG:
    364         currentState = TAG_SPACE;
    365         break;
    366       default:
    367         throw new IllegalArgumentException("Did not recognize Mode: " +
    368                                            mode.toString());
    369     }
    370   }
    371 
    372   /**
    373    * Resets the state of the parser to the initial state of parsing HTML.
    374    */
    375   public void reset() {
    376     super.reset();
    377     resetMode(Mode.HTML);
    378   }
    379 
    380   /**
    381    * A specialized directive to tell the parser there is some content
    382    * that will be inserted here but that it will not get to parse. Used
    383    * by the template system that may not be able to give some content
    384    * to the parser but wants it to know there typically will be content
    385    * inserted at that point.  This is a hint used in corner cases within
    386    * parsing of HTML attribute names and values where content we do not
    387    * get to see could affect our parsing and alter our current state.
    388    *
    389    * <p>The two cases where {@code #insertText()} affects our parsing are:
    390    * <ul>
    391    * <li>We are at the start of the value of a URL-accepting HTML attribute. In
    392    * that case, we change internal state to no longer be considered at the
    393    * start of the URL. This may affect what escaping template systems may want
    394    * to perform on the HTML attribute value. We avoid injecting fake data and
    395    * hence not modify the current index of the value as determined by
    396    * {@link #getValueIndex()}</li>
    397    * <li>We just transitioned from an attribute name to an attribute value
    398    * (by parsing the separating {@code '='} character). In that case, we
    399    * change internal state to be now inside a non-quoted HTML attribute
    400    * value.</li>
    401    * </ul>
    402    *
    403    * @throws ParseException if an unrecoverable error occurred during parsing
    404    */
    405   @Override
    406   public void insertText() throws ParseException {
    407     // Case: Inside URL attribute value.
    408     if (getState() == STATE_VALUE
    409         && getAttributeType() == ATTR_TYPE.URI
    410         && isUrlStart()) {
    411       textInsideUrlValue = true;
    412     }
    413     // Case: Before parsing any attribute value.
    414     if (currentState == VALUE) {
    415       setNextState(VALUE_TEXT);
    416     }
    417   }
    418 
    419   @Override
    420   protected InternalState handleEnterState(InternalState currentState,
    421                                            InternalState expectedNextState,
    422                                            char input) {
    423     InternalState nextState = expectedNextState;
    424     if (currentState == TAG_NAME) {
    425       enterTagName();
    426     } else if (currentState == ATTR) {
    427       enterAttribute();
    428     } else if (currentState == TAG_CLOSE) {
    429       nextState = tagClose(currentState);
    430     } else if (currentState == CDATA_MAY_CLOSE) {
    431       enterStateCdataMayClose();
    432     } else if (currentState == VALUE) {
    433       enterValue();
    434     } else
    435     if (currentState == VALUE_TEXT || currentState == VALUE_Q
    436         || currentState == VALUE_DQ) {
    437       enterValueContent();
    438     }
    439     return nextState;
    440   }
    441 
    442   @Override
    443   protected InternalState handleExitState(InternalState currentState,
    444                                           InternalState expectedNextState,
    445                                           char input) {
    446     InternalState nextState = expectedNextState;
    447     if (currentState == TAG_NAME) {
    448       exitTagName();
    449     } else if (currentState == ATTR) {
    450       exitAttribute();
    451     } else if (currentState == CDATA_MAY_CLOSE) {
    452       nextState = exitStateCdataMayClose(nextState, input);
    453     } else
    454     if ((currentState == VALUE_TEXT) || (currentState == VALUE_Q)
    455         || (currentState == VALUE_DQ)) {
    456       exitValueContent();
    457     }
    458     return nextState;
    459   }
    460 
    461   @Override
    462   protected InternalState handleInState(InternalState currentState,
    463                                         char input) throws ParseException {
    464     if ((currentState == CDATA_TEXT)
    465         || (currentState == CDATA_COM_START)
    466         || (currentState == CDATA_COM_START_DASH)
    467         || (currentState == CDATA_COM_BODY)
    468         || (currentState == CDATA_COM_DASH)
    469         || (currentState == CDATA_COM_DASH_DASH)
    470         || (currentState == CDATA_LT)
    471         || (currentState == CDATA_MAY_CLOSE)
    472         || (currentState == JS_FILE)) {
    473       inStateCdata(input);
    474     } else if ((currentState == VALUE_TEXT)
    475                || (currentState == VALUE_Q)
    476                || (currentState == VALUE_DQ)) {
    477       inStateValue(input);
    478     }
    479     return currentState;
    480   }
    481 
    482   /**
    483    * Invokes recording on all CharacterRecorder objects. Currently we do
    484    * not check that one and only one of them is recording. I did a fair
    485    * bit of testing on the C++ parser and was not convinced there is
    486    * such a guarantee.
    487    */
    488   @Override
    489   protected void record(char input) {
    490     attr.maybeRecord(input);
    491     tag.maybeRecord(input);
    492     value.maybeRecord(input);
    493     cdataCloseTag.maybeRecord(input);
    494   }
    495 
    496   /**
    497    * Starts recording the name of the HTML tag. Called when the parser
    498    * enters a new tag.
    499    */
    500   private void enterTagName() {
    501     tag.startRecording();
    502   }
    503 
    504   private void exitTagName() {
    505     tag.stopRecording();
    506     String tagString = tag.getContent();
    507     if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
    508       tag.reset();
    509     }
    510   }
    511 
    512   /**
    513    * Starts recording the name of the HTML attribute. Called when the parser
    514    * enters a new HTML attribute.
    515    */
    516   private void enterAttribute() {
    517     attr.startRecording();
    518   }
    519 
    520   private void exitAttribute() {
    521     attr.stopRecording();
    522   }
    523 
    524   /**
    525    * Tracks the index within the HTML attribute value and initializes
    526    * the javascript parser for attributes that take javascript.
    527    *
    528    * Called when the parser enters a new HTML attribute value.
    529    */
    530   private void enterValue() {
    531     valueIndex = 0;
    532     textInsideUrlValue = false;
    533     if (HtmlUtils.isAttributeJavascript(getAttribute())) {
    534       entityResolver.reset();
    535       jsParser.reset();
    536       insideJavascript = true;
    537     } else {
    538       insideJavascript = false;
    539     }
    540   }
    541 
    542   /**
    543    * Starts recordning the contents of the attribute value.
    544    *
    545    * Called when entering an attribute value.
    546    */
    547   private void enterValueContent() {
    548     value.startRecording();
    549   }
    550 
    551   /**
    552    * Stops the recording of the attribute value and exits javascript
    553    * (in case we were inside it).
    554    */
    555   private void exitValueContent() {
    556     value.stopRecording();
    557     insideJavascript = false;
    558   }
    559 
    560   /**
    561    * Processes javascript after performing entity resolution and updates
    562    * the position within the attribute value.
    563    * If the status of the entity resolution is <code>IN_PROGRESS</code>,
    564    * we don't invoke the javascript parser.
    565    *
    566    * <p>Called for every character inside an attribute value.
    567    *
    568    * @param input character read
    569    * @throws ParseException if an unrecoverable error occurred during parsing
    570    */
    571   private void inStateValue(char input) throws ParseException {
    572     valueIndex++;
    573     if (insideJavascript) {
    574       EntityResolver.Status status = entityResolver.processChar(input);
    575       if (status == EntityResolver.Status.COMPLETED) {
    576         jsParser.parse(entityResolver.getEntity());
    577         entityResolver.reset();
    578       } else if (status == EntityResolver.Status.NOT_STARTED) {
    579         jsParser.parse(input);
    580       }
    581     }
    582   }
    583 
    584   /**
    585    * Handles the tag it finished reading.
    586    *
    587    * <p>For a script tag, it initializes the javascript parser. For all
    588    * tags that are recognized to have CDATA values
    589    * (including the script tag), it switches the CDATA state to handle them
    590    * properly. For code simplification, CDATA and RCDATA sections are
    591    * treated the same.
    592    *
    593    * <p>Called when the parser leaves a tag definition.
    594    *
    595    * @param state current state
    596    * @return state next state, could be the same as current state
    597    */
    598   private InternalState tagClose(InternalState state) {
    599     InternalState nextState = state;
    600     String tagName = getTag();
    601     if ("script".equals(tagName)) {
    602       nextState = CDATA_TEXT;
    603       jsParser.reset();
    604       insideJavascript = true;
    605     } else if ("style".equals(tagName)
    606                  || "title".equals(tagName)
    607                  || "textarea".equals(tagName)) {
    608       nextState = CDATA_TEXT;
    609       insideJavascript = false;
    610     }
    611     return nextState;
    612   }
    613 
    614   /**
    615    * Feeds the character to the javascript parser for processing.
    616    *
    617    * <p>Called inside CDATA blocks to parse javascript.
    618    *
    619    * @param input character read
    620    * @throws ParseException if an unrecoverable error occurred during parsing
    621    */
    622   private void inStateCdata(char input) throws ParseException {
    623     if (insideJavascript) {
    624       jsParser.parse(input);
    625     }
    626   }
    627 
    628   /**
    629    * Starts recording. This is so we find the closing tag name in order to
    630    * know if the tag is going to be closed or not.
    631    *
    632    * <p>Called when encountering a '<' character in a CDATA section.
    633    */
    634   private void enterStateCdataMayClose() {
    635     cdataCloseTag.startRecording();
    636   }
    637 
    638   /**
    639    * Determines whether to close the tag element, It closes it if it finds
    640    * the corresponding end tag. Called when reading what could be a
    641    * closing CDATA tag.
    642    *
    643    * @param input the character read
    644    * @param expectedNextState the expected state to go to next
    645    *        unless we want to change it here
    646    * @return the next state to go to
    647    */
    648   private InternalState exitStateCdataMayClose(
    649       InternalState expectedNextState,
    650       char input) {
    651     InternalState nextState = expectedNextState;
    652     cdataCloseTag.stopRecording();
    653     String cdataCloseTagString = cdataCloseTag.getContent();
    654     Preconditions.checkState(!cdataCloseTagString.isEmpty()
    655         && cdataCloseTagString.charAt(0) == '/');  // Developer error.
    656 
    657     if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
    658         && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
    659       tag.clear();
    660       insideJavascript = false;
    661     } else {
    662       nextState = CDATA_TEXT;
    663     }
    664     return nextState;
    665   }
    666 
    667 
    668   // ======================================================= //
    669   // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
    670   // ======================================================= //
    671 
    672   private static void registerMapping(InternalState internalState,
    673                                       ExternalState externalState) {
    674     STATE_MAPPING.put(internalState, externalState);
    675   }
    676 
    677   private static void initializeStateMapping() {
    678     // Each parser implementation must map the error state appropriately.
    679     registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);
    680 
    681     registerMapping(TEXT, HtmlParser.STATE_TEXT);
    682     registerMapping(TAG_START, HtmlParser.STATE_TAG);
    683     registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
    684     registerMapping(DECL_START, HtmlParser.STATE_TEXT);
    685     registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
    686     registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
    687     registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
    688     registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
    689     registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
    690     registerMapping(PI, HtmlParser.STATE_TEXT);
    691     registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
    692     registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
    693     registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
    694     registerMapping(ATTR, HtmlParser.STATE_ATTR);
    695     registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
    696     registerMapping(VALUE, HtmlParser.STATE_VALUE);
    697     registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
    698     registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
    699     registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
    700     registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
    701     registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
    702     registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
    703     registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
    704     registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
    705     registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
    706     registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
    707     registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
    708     registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
    709     registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
    710     registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
    711     registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
    712   }
    713 
    714   private static void registerTransition(String expression,
    715                                          InternalState source,
    716                                          InternalState to) {
    717     // It seems to silly to go through a StateTableTransition here
    718     // but it adds extra data checking.
    719     StateTableTransition stt = new StateTableTransition(expression,
    720                                                         source, to);
    721     STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
    722                               stt.getTo());
    723   }
    724 
    725   // NOTE: The "[:default:]" transition should be registered before any
    726   //   other transitions for a given state or it will over-write them.
          //
          // Populates the parser's state table by calling registerTransition once
          // per (expression, source state, destination state) triple. Within each
          // state's group the "[:default:]" entry, when present, comes first, as
          // the note above requires. Expressions such as "A-Za-z0-9_:-" are
          // presumably character ranges interpreted by the state table -- TODO
          // confirm against the state table implementation.
    727   private static void initializeParserStateTable() {
            // CSS_FILE and JS_FILE: the only transition registered here loops
            // back to the same state.
    728     registerTransition("[:default:]", CSS_FILE, CSS_FILE);
    729     registerTransition("[:default:]", JS_FILE, JS_FILE);
            // CDATA_MAY_CLOSE (reached from CDATA_LT on "/"): characters in
            // "A-Za-z0-9/_:-" stay here, whitespace moves to TAG_SPACE, ">" moves
            // to TEXT, and anything else falls back to CDATA_TEXT.
    730     registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
    731     registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
    732     registerTransition(">", CDATA_MAY_CLOSE, TEXT);
    733     registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
            // CDATA_LT (reached from CDATA_TEXT on "<"): "!" moves to
            // CDATA_COM_START, "/" to CDATA_MAY_CLOSE; anything else returns to
            // CDATA_TEXT.
    734     registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
    735     registerTransition("!", CDATA_LT, CDATA_COM_START);
    736     registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
    737     registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
    738     registerTransition("<", CDATA_TEXT, CDATA_LT);
            // CDATA_COM_* states: starting from CDATA_TEXT, the sequence "<!--"
            // ends up in CDATA_COM_BODY, and from there "--" followed by ">"
            // returns to CDATA_TEXT. Extra "-" characters stay in
            // CDATA_COM_DASH_DASH; any other character after one or two dashes
            // goes back to CDATA_COM_BODY. An incomplete "<!" or "<!-" prefix
            // falls back to CDATA_TEXT.
    739     registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
    740     registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
    741     registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
    742     registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
    743     registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
    744     registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
    745     registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
    746     registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
    747     registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
    748     registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
    749     registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
            // Quoted values: the matching quote character moves to TAG_SPACE;
            // every other character stays in (or enters) VALUE_DQ or VALUE_Q
            // respectively.
    750     registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
    751     registerTransition("\"", VALUE_DQ, TAG_SPACE);
    752     registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
    753     registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
    754     registerTransition("[:default:]", VALUE_Q, VALUE_Q);
    755     registerTransition("\'", VALUE_Q, TAG_SPACE);
    756     registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
    757     registerTransition("\'", VALUE_Q_START, TAG_SPACE);
            // VALUE_TEXT (entered from VALUE by any character that has no
            // transition of its own): whitespace moves to TAG_SPACE and ">" moves
            // to TAG_CLOSE.
    758     registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
    759     registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
    760     registerTransition(">", VALUE_TEXT, TAG_CLOSE);
            // VALUE (reached on "=" from ATTR or ATTR_SPACE): whitespace stays in
            // VALUE, a quote moves to the matching *_START state, ">" moves to
            // TAG_CLOSE, and any other character moves to VALUE_TEXT.
    761     registerTransition("[:default:]", VALUE, VALUE_TEXT);
    762     registerTransition(">", VALUE, TAG_CLOSE);
    763     registerTransition(" \t\n\r", VALUE, VALUE);
    764     registerTransition("\"", VALUE, VALUE_DQ_START);
    765     registerTransition("\'", VALUE, VALUE_Q_START);
            // ATTR_SPACE and ATTR: no "[:default:]" transition is registered for
            // these states in this method.
    766     registerTransition("=", ATTR_SPACE, VALUE);
    767     registerTransition("/", ATTR_SPACE, TAG_SPACE);
    768     registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
    769     registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
    770     registerTransition(">", ATTR_SPACE, TAG_CLOSE);
    771     registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
    772     registerTransition("=", ATTR, VALUE);
    773     registerTransition("/", ATTR, TAG_SPACE);
    774     registerTransition(">", ATTR, TAG_CLOSE);
            // Unlike the expressions that enter ATTR from ATTR_SPACE or
            // TAG_SPACE, this one also contains ".".
    775     registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
            // TAG_CLOSE: "<" moves to TAG_START; any other character moves to
            // TEXT.
    776     registerTransition("[:default:]", TAG_CLOSE, TEXT);
    777     registerTransition("<", TAG_CLOSE, TAG_START);
            // TAG_SPACE: like ATTR_SPACE and ATTR, it has no "[:default:]"
            // transition registered in this method.
    778     registerTransition("/", TAG_SPACE, TAG_SPACE);
    779     registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
    780     registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
    781     registerTransition(">", TAG_SPACE, TAG_CLOSE);
            // PI states (PI is reached from TAG_START on "?"): "?" followed by
            // ">" returns to TEXT; any other character after "?" goes back to PI.
    782     registerTransition("[:default:]", PI_MAY_END, PI);
    783     registerTransition(">", PI_MAY_END, TEXT);
    784     registerTransition("[:default:]", PI, PI);
    785     registerTransition("?", PI, PI_MAY_END);
            // COM_* states: from COM_BODY, "--" followed by ">" returns to TEXT.
            // Extra "-" characters stay in COM_DASH_DASH; any other character
            // after one or two dashes goes back to COM_BODY. COM_OPEN (reached
            // from DECL_START on "-") needs a second "-" to enter COM_BODY,
            // otherwise it returns to TEXT.
    786     registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
    787     registerTransition(">", COM_DASH_DASH, TEXT);
    788     registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
    789     registerTransition("[:default:]", COM_DASH, COM_BODY);
    790     registerTransition("-", COM_DASH, COM_DASH_DASH);
    791     registerTransition("[:default:]", COM_BODY, COM_BODY);
    792     registerTransition("-", COM_BODY, COM_DASH);
    793     registerTransition("[:default:]", COM_OPEN, TEXT);
    794     registerTransition("-", COM_OPEN, COM_BODY);
            // DECL_START (reached from TAG_START on "!"): "-" moves to COM_OPEN,
            // ">" returns to TEXT, and anything else enters DECL_BODY, which is
            // left only on ">".
    795     registerTransition("[:default:]", DECL_BODY, DECL_BODY);
    796     registerTransition(">", DECL_BODY, TEXT);
    797     registerTransition("[:default:]", DECL_START, DECL_BODY);
    798     registerTransition(">", DECL_START, TEXT);
    799     registerTransition("-", DECL_START, COM_OPEN);
            // TAG_NAME: no "[:default:]" transition is registered for this state
            // in this method.
    800     registerTransition(">", TAG_NAME, TAG_CLOSE);
    801     registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
    802     registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);
    803 
    804     // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
    805     registerTransition("[:default:]", TAG_START, TEXT);
    806     registerTransition("<", TAG_START, TAG_START);
    807     // End of manual change.
    808 
    809     registerTransition("!", TAG_START, DECL_START);
    810     registerTransition("?", TAG_START, PI);
    811     registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
            // TEXT: "<" moves to TAG_START; every other character stays in TEXT.
    812     registerTransition("[:default:]", TEXT, TEXT);
    813     registerTransition("<", TEXT, TAG_START);
    814   }
    815 }
    816