Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
      4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "HTMLTokenizer.h"
     30 
     31 #include "HTMLEntityParser.h"
     32 #include "HTMLToken.h"
     33 #include "HTMLTreeBuilder.h"
     34 #include "HTMLNames.h"
     35 #include "NotImplemented.h"
     36 #include <wtf/ASCIICType.h>
     37 #include <wtf/CurrentTime.h>
     38 #include <wtf/UnusedParam.h>
     39 #include <wtf/text/AtomicString.h>
     40 #include <wtf/text/CString.h>
     41 #include <wtf/unicode/Unicode.h>
     42 
     43 using namespace WTF;
     44 
     45 namespace WebCore {
     46 
     47 using namespace HTMLNames;
     48 
// Sentinel code unit (value 0). The tokenizer state machine compares the
// current input character against this marker and emits the EndOfFile token
// (or reports a parse error) on a match.
const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0;
     50 
     51 namespace {
     52 
     53 inline UChar toLowerCase(UChar cc)
     54 {
     55     ASSERT(isASCIIUpper(cc));
     56     const int lowerCaseOffset = 0x20;
     57     return cc + lowerCaseOffset;
     58 }
     59 
     60 inline bool isTokenizerWhitespace(UChar cc)
     61 {
     62     return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C';
     63 }
     64 
     65 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters)
     66 {
     67     while (*expectedCharacters)
     68         source.advanceAndASSERTIgnoringCase(*expectedCharacters++);
     69 }
     70 
     71 inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters)
     72 {
     73     while (*expectedCharacters)
     74         source.advanceAndASSERT(*expectedCharacters++);
     75 }
     76 
     77 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string)
     78 {
     79     if (vector.size() != string.length())
     80         return false;
     81     const UChar* stringData = string.characters();
     82     const UChar* vectorData = vector.data();
     83     // FIXME: Is there a higher-level function we should be calling here?
     84     return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar));
     85 }
     86 
     87 inline bool isEndTagBufferingState(HTMLTokenizer::State state)
     88 {
     89     switch (state) {
     90     case HTMLTokenizer::RCDATAEndTagOpenState:
     91     case HTMLTokenizer::RCDATAEndTagNameState:
     92     case HTMLTokenizer::RAWTEXTEndTagOpenState:
     93     case HTMLTokenizer::RAWTEXTEndTagNameState:
     94     case HTMLTokenizer::ScriptDataEndTagOpenState:
     95     case HTMLTokenizer::ScriptDataEndTagNameState:
     96     case HTMLTokenizer::ScriptDataEscapedEndTagOpenState:
     97     case HTMLTokenizer::ScriptDataEscapedEndTagNameState:
     98         return true;
     99     default:
    100         return false;
    101     }
    102 }
    103 
    104 }
    105 
// Constructs a tokenizer. |usePreHTML5ParserQuirks| is stored and consulted
// by the state machine later (for example, TagNameState ends the tag on '<'
// when it is set). The input stream preprocessor is given a pointer back to
// this tokenizer, and the remaining per-parse state is initialised by reset().
HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks)
    : m_inputStreamPreprocessor(this)
    , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks)
{
    reset();
}
    112 
// The destructor body is intentionally empty: nothing is released explicitly
// here.
HTMLTokenizer::~HTMLTokenizer()
{
}
    116 
    117 void HTMLTokenizer::reset()
    118 {
    119     m_state = DataState;
    120     m_token = 0;
    121     m_lineNumber = 0;
    122     m_skipLeadingNewLineForListing = false;
    123     m_forceNullCharacterReplacement = false;
    124     m_shouldAllowCDATA = false;
    125     m_additionalAllowedCharacter = '\0';
    126 }
    127 
    128 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
    129 {
    130     bool notEnoughCharacters = false;
    131     Vector<UChar, 16> decodedEntity;
    132     bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
    133     if (notEnoughCharacters)
    134         return false;
    135     if (!success) {
    136         ASSERT(decodedEntity.isEmpty());
    137         bufferCharacter('&');
    138     } else {
    139         Vector<UChar>::const_iterator iter = decodedEntity.begin();
    140         for (; iter != decodedEntity.end(); ++iter)
    141             bufferCharacter(*iter);
    142     }
    143     return true;
    144 }
    145 
#if COMPILER(MSVC)
// Disable MSVC's "unreachable code" warning (C4702): the state machine below
// deliberately places ASSERT_NOT_REACHED() after each state (see END_STATE)
// to assert that control never falls out of a state.
#pragma warning(disable: 4702)
#endif
    151 
// BEGIN_STATE opens a state inside the tokenizer's switch statement. It
// expands to both a case label and a goto label with the same name, so a
// state can be entered through the switch dispatch or through
// `goto stateName` (used by the transition macros below).
#define BEGIN_STATE(stateName) case stateName: stateName:
// END_STATE closes a state. Every path through a state is expected to leave
// via return or goto, so reaching this point trips ASSERT_NOT_REACHED(); the
// trailing break keeps the switch well-formed.
#define END_STATE() ASSERT_NOT_REACHED(); break;
    154 
// We use this macro when the HTML5 spec says "reconsume the current input
// character in the <mumble> state."
//
// It records the new state in m_state and jumps straight to that state's
// label without touching the input, so the same character (cc) is examined
// again by the target state.
#define RECONSUME_IN(stateName)                                            \
    do {                                                                   \
        m_state = stateName;                                               \
        goto stateName;                                                    \
    } while (false)
    162 
    163 // We use this macro when the HTML5 spec says "consume the next input
    164 // character ... and switch to the <mumble> state."
    165 #define ADVANCE_TO(stateName)                                              \
    166     do {                                                                   \
    167         m_state = stateName;                                               \
    168         if (!m_inputStreamPreprocessor.advance(source, m_lineNumber))      \
    169             return haveBufferedCharacterToken();                           \
    170         cc = m_inputStreamPreprocessor.nextInputCharacter();               \
    171         goto stateName;                                                    \
    172     } while (false)
    173 
// Sometimes there's more complicated logic in the spec that separates when
// we consume the next input character and when we switch to a particular
// state. We handle those cases by advancing the source directly and using
// this macro to switch to the indicated state.
//
// Unlike ADVANCE_TO, this macro does not advance the input itself: it records
// the new state, then peeks at the source. If the source is empty or the peek
// fails, the enclosing function returns haveBufferedCharacterToken();
// otherwise cc is reloaded and control jumps to the state's label. The macro
// relies on `source` and `cc` being in scope at the expansion site.
#define SWITCH_TO(stateName)                                               \
    do {                                                                   \
        m_state = stateName;                                               \
        if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \
            return haveBufferedCharacterToken();                           \
        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
        goto stateName;                                                    \
    } while (false)
    186 
    187 
    188 inline void HTMLTokenizer::saveEndTagNameIfNeeded()
    189 {
    190     ASSERT(m_token->type() != HTMLToken::Uninitialized);
    191     if (m_token->type() == HTMLToken::StartTag)
    192         m_appropriateEndTagName = m_token->name();
    193 }
    194 
    195 // We use this function when the HTML5 spec says "Emit the current <mumble>
    196 // token. Switch to the <mumble> state."  We use the word "resume" instead of
    197 // switch to indicate that this macro actually returns and that we'll end up
    198 // in the state when we "resume" (i.e., are called again).
    199 bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state)
    200 {
    201     m_state = state;
    202     source.advance(m_lineNumber);
    203     saveEndTagNameIfNeeded();
    204     return true;
    205 }
    206 
    207 // Identical to emitAndResumeIn, except does not advance.
    208 bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state)
    209 {
    210     m_state = state;
    211     saveEndTagNameIfNeeded();
    212     return true;
    213 }
    214 
    215 // Used to emit the EndOfFile token.
    216 // Check if we have buffered characters to emit first before emitting the EOF.
    217 bool HTMLTokenizer::emitEndOfFile(SegmentedString& source)
    218 {
    219     if (haveBufferedCharacterToken())
    220         return true;
    221     m_state = DataState;
    222     source.advance(m_lineNumber);
    223     m_token->clear();
    224     m_token->makeEndOfFile();
    225     return true;
    226 }
    227 
    228 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
    229 {
    230     ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
    231     source.advance(m_lineNumber);
    232     if (m_token->type() == HTMLToken::Character)
    233         return true;
    234     m_token->beginEndTag(m_bufferedEndTagName);
    235     m_bufferedEndTagName.clear();
    236     return false;
    237 }
    238 
// Used by the end-tag-name states once the buffered name is known to be an
// appropriate end tag. It records the new state and calls
// flushBufferedEndTag(), which consumes the current character. If that
// reports a pending Character token, the enclosing function returns true so
// the token is emitted first. Otherwise the macro peeks at the source,
// returning haveBufferedCharacterToken() when no input is available, reloads
// cc and jumps to the state's label. The macro relies on `source` and `cc`
// being in scope at the expansion site.
#define FLUSH_AND_ADVANCE_TO(stateName)                                    \
    do {                                                                   \
        m_state = stateName;                                               \
        if (flushBufferedEndTag(source))                                   \
            return true;                                                   \
        if (source.isEmpty()                                               \
            || !m_inputStreamPreprocessor.peek(source, m_lineNumber))      \
            return haveBufferedCharacterToken();                           \
        cc = m_inputStreamPreprocessor.nextInputCharacter();               \
        goto stateName;                                                    \
    } while (false)
    250 
    251 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state)
    252 {
    253     m_state = state;
    254     flushBufferedEndTag(source);
    255     return true;
    256 }
    257 
    258 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
    259 {
    260     // If we have a token in progress, then we're supposed to be called back
    261     // with the same token so we can finish it.
    262     ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
    263     m_token = &token;
    264 
    265     if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
    266         // FIXME: This should call flushBufferedEndTag().
    267         // We started an end tag during our last iteration.
    268         m_token->beginEndTag(m_bufferedEndTagName);
    269         m_bufferedEndTagName.clear();
    270         if (m_state == DataState) {
    271             // We're back in the data state, so we must be done with the tag.
    272             return true;
    273         }
    274     }
    275 
    276     if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber))
    277         return haveBufferedCharacterToken();
    278     UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
    279 
    280     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
    281     // Note that this logic is different than the generic \r\n collapsing
    282     // handled in the input stream preprocessor. This logic is here as an
    283     // "authoring convenience" so folks can write:
    284     //
    285     // <pre>
    286     // lorem ipsum
    287     // lorem ipsum
    288     // </pre>
    289     //
    290     // without getting an extra newline at the start of their <pre> element.
    291     if (m_skipLeadingNewLineForListing) {
    292         m_skipLeadingNewLineForListing = false;
    293         if (cc == '\n') {
    294             if (m_state == DataState)
    295                 ADVANCE_TO(DataState);
    296             if (m_state == RCDATAState)
    297                 ADVANCE_TO(RCDATAState);
    298             // When parsing text/plain documents, we run the tokenizer in the
    299             // PLAINTEXTState and ignore m_skipLeadingNewLineForListing.
    300             ASSERT(m_state == PLAINTEXTState);
    301         }
    302     }
    303 
    304     // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
    305     switch (m_state) {
    306     BEGIN_STATE(DataState) {
    307         if (cc == '&')
    308             ADVANCE_TO(CharacterReferenceInDataState);
    309         else if (cc == '<') {
    310             if (m_token->type() == HTMLToken::Character) {
    311                 // We have a bunch of character tokens queued up that we
    312                 // are emitting lazily here.
    313                 return true;
    314             }
    315             ADVANCE_TO(TagOpenState);
    316         } else if (cc == InputStreamPreprocessor::endOfFileMarker)
    317             return emitEndOfFile(source);
    318         else {
    319             bufferCharacter(cc);
    320             ADVANCE_TO(DataState);
    321         }
    322     }
    323     END_STATE()
    324 
    325     BEGIN_STATE(CharacterReferenceInDataState) {
    326         if (!processEntity(source))
    327             return haveBufferedCharacterToken();
    328         SWITCH_TO(DataState);
    329     }
    330     END_STATE()
    331 
    332     BEGIN_STATE(RCDATAState) {
    333         if (cc == '&')
    334             ADVANCE_TO(CharacterReferenceInRCDATAState);
    335         else if (cc == '<')
    336             ADVANCE_TO(RCDATALessThanSignState);
    337         else if (cc == InputStreamPreprocessor::endOfFileMarker)
    338             return emitEndOfFile(source);
    339         else {
    340             bufferCharacter(cc);
    341             ADVANCE_TO(RCDATAState);
    342         }
    343     }
    344     END_STATE()
    345 
    346     BEGIN_STATE(CharacterReferenceInRCDATAState) {
    347         if (!processEntity(source))
    348             return haveBufferedCharacterToken();
    349         SWITCH_TO(RCDATAState);
    350     }
    351     END_STATE()
    352 
    353     BEGIN_STATE(RAWTEXTState) {
    354         if (cc == '<')
    355             ADVANCE_TO(RAWTEXTLessThanSignState);
    356         else if (cc == InputStreamPreprocessor::endOfFileMarker)
    357             return emitEndOfFile(source);
    358         else {
    359             bufferCharacter(cc);
    360             ADVANCE_TO(RAWTEXTState);
    361         }
    362     }
    363     END_STATE()
    364 
    365     BEGIN_STATE(ScriptDataState) {
    366         if (cc == '<')
    367             ADVANCE_TO(ScriptDataLessThanSignState);
    368         else if (cc == InputStreamPreprocessor::endOfFileMarker)
    369             return emitEndOfFile(source);
    370         else {
    371             bufferCharacter(cc);
    372             ADVANCE_TO(ScriptDataState);
    373         }
    374     }
    375     END_STATE()
    376 
    377     BEGIN_STATE(PLAINTEXTState) {
    378         if (cc == InputStreamPreprocessor::endOfFileMarker)
    379             return emitEndOfFile(source);
    380         else
    381             bufferCharacter(cc);
    382         ADVANCE_TO(PLAINTEXTState);
    383     }
    384     END_STATE()
    385 
    386     BEGIN_STATE(TagOpenState) {
    387         if (cc == '!')
    388             ADVANCE_TO(MarkupDeclarationOpenState);
    389         else if (cc == '/')
    390             ADVANCE_TO(EndTagOpenState);
    391         else if (isASCIIUpper(cc)) {
    392             m_token->beginStartTag(toLowerCase(cc));
    393             ADVANCE_TO(TagNameState);
    394         } else if (isASCIILower(cc)) {
    395             m_token->beginStartTag(cc);
    396             ADVANCE_TO(TagNameState);
    397         } else if (cc == '?') {
    398             parseError();
    399             // The spec consumes the current character before switching
    400             // to the bogus comment state, but it's easier to implement
    401             // if we reconsume the current character.
    402             RECONSUME_IN(BogusCommentState);
    403         } else {
    404             parseError();
    405             bufferCharacter('<');
    406             RECONSUME_IN(DataState);
    407         }
    408     }
    409     END_STATE()
    410 
    411     BEGIN_STATE(EndTagOpenState) {
    412         if (isASCIIUpper(cc)) {
    413             m_token->beginEndTag(toLowerCase(cc));
    414             ADVANCE_TO(TagNameState);
    415         } else if (isASCIILower(cc)) {
    416             m_token->beginEndTag(cc);
    417             ADVANCE_TO(TagNameState);
    418         } else if (cc == '>') {
    419             parseError();
    420             ADVANCE_TO(DataState);
    421         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    422             parseError();
    423             bufferCharacter('<');
    424             bufferCharacter('/');
    425             RECONSUME_IN(DataState);
    426         } else {
    427             parseError();
    428             RECONSUME_IN(BogusCommentState);
    429         }
    430     }
    431     END_STATE()
    432 
    433     BEGIN_STATE(TagNameState) {
    434         if (isTokenizerWhitespace(cc))
    435             ADVANCE_TO(BeforeAttributeNameState);
    436         else if (cc == '/')
    437             ADVANCE_TO(SelfClosingStartTagState);
    438         else if (cc == '>')
    439             return emitAndResumeIn(source, DataState);
    440         else if (m_usePreHTML5ParserQuirks && cc == '<')
    441             return emitAndReconsumeIn(source, DataState);
    442         else if (isASCIIUpper(cc)) {
    443             m_token->appendToName(toLowerCase(cc));
    444             ADVANCE_TO(TagNameState);
    445         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
    446             parseError();
    447             RECONSUME_IN(DataState);
    448         } else {
    449             m_token->appendToName(cc);
    450             ADVANCE_TO(TagNameState);
    451         }
    452     }
    453     END_STATE()
    454 
    455     BEGIN_STATE(RCDATALessThanSignState) {
    456         if (cc == '/') {
    457             m_temporaryBuffer.clear();
    458             ASSERT(m_bufferedEndTagName.isEmpty());
    459             ADVANCE_TO(RCDATAEndTagOpenState);
    460         } else {
    461             bufferCharacter('<');
    462             RECONSUME_IN(RCDATAState);
    463         }
    464     }
    465     END_STATE()
    466 
    467     BEGIN_STATE(RCDATAEndTagOpenState) {
    468         if (isASCIIUpper(cc)) {
    469             m_temporaryBuffer.append(cc);
    470             addToPossibleEndTag(toLowerCase(cc));
    471             ADVANCE_TO(RCDATAEndTagNameState);
    472         } else if (isASCIILower(cc)) {
    473             m_temporaryBuffer.append(cc);
    474             addToPossibleEndTag(cc);
    475             ADVANCE_TO(RCDATAEndTagNameState);
    476         } else {
    477             bufferCharacter('<');
    478             bufferCharacter('/');
    479             RECONSUME_IN(RCDATAState);
    480         }
    481     }
    482     END_STATE()
    483 
    484     BEGIN_STATE(RCDATAEndTagNameState) {
    485         if (isASCIIUpper(cc)) {
    486             m_temporaryBuffer.append(cc);
    487             addToPossibleEndTag(toLowerCase(cc));
    488             ADVANCE_TO(RCDATAEndTagNameState);
    489         } else if (isASCIILower(cc)) {
    490             m_temporaryBuffer.append(cc);
    491             addToPossibleEndTag(cc);
    492             ADVANCE_TO(RCDATAEndTagNameState);
    493         } else {
    494             if (isTokenizerWhitespace(cc)) {
    495                 if (isAppropriateEndTag())
    496                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
    497             } else if (cc == '/') {
    498                 if (isAppropriateEndTag())
    499                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
    500             } else if (cc == '>') {
    501                 if (isAppropriateEndTag())
    502                     return flushEmitAndResumeIn(source, DataState);
    503             }
    504             bufferCharacter('<');
    505             bufferCharacter('/');
    506             m_token->appendToCharacter(m_temporaryBuffer);
    507             m_bufferedEndTagName.clear();
    508             RECONSUME_IN(RCDATAState);
    509         }
    510     }
    511     END_STATE()
    512 
    513     BEGIN_STATE(RAWTEXTLessThanSignState) {
    514         if (cc == '/') {
    515             m_temporaryBuffer.clear();
    516             ASSERT(m_bufferedEndTagName.isEmpty());
    517             ADVANCE_TO(RAWTEXTEndTagOpenState);
    518         } else {
    519             bufferCharacter('<');
    520             RECONSUME_IN(RAWTEXTState);
    521         }
    522     }
    523     END_STATE()
    524 
    525     BEGIN_STATE(RAWTEXTEndTagOpenState) {
    526         if (isASCIIUpper(cc)) {
    527             m_temporaryBuffer.append(cc);
    528             addToPossibleEndTag(toLowerCase(cc));
    529             ADVANCE_TO(RAWTEXTEndTagNameState);
    530         } else if (isASCIILower(cc)) {
    531             m_temporaryBuffer.append(cc);
    532             addToPossibleEndTag(cc);
    533             ADVANCE_TO(RAWTEXTEndTagNameState);
    534         } else {
    535             bufferCharacter('<');
    536             bufferCharacter('/');
    537             RECONSUME_IN(RAWTEXTState);
    538         }
    539     }
    540     END_STATE()
    541 
    542     BEGIN_STATE(RAWTEXTEndTagNameState) {
    543         if (isASCIIUpper(cc)) {
    544             m_temporaryBuffer.append(cc);
    545             addToPossibleEndTag(toLowerCase(cc));
    546             ADVANCE_TO(RAWTEXTEndTagNameState);
    547         } else if (isASCIILower(cc)) {
    548             m_temporaryBuffer.append(cc);
    549             addToPossibleEndTag(cc);
    550             ADVANCE_TO(RAWTEXTEndTagNameState);
    551         } else {
    552             if (isTokenizerWhitespace(cc)) {
    553                 if (isAppropriateEndTag())
    554                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
    555             } else if (cc == '/') {
    556                 if (isAppropriateEndTag())
    557                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
    558             } else if (cc == '>') {
    559                 if (isAppropriateEndTag())
    560                     return flushEmitAndResumeIn(source, DataState);
    561             }
    562             bufferCharacter('<');
    563             bufferCharacter('/');
    564             m_token->appendToCharacter(m_temporaryBuffer);
    565             m_bufferedEndTagName.clear();
    566             RECONSUME_IN(RAWTEXTState);
    567         }
    568     }
    569     END_STATE()
    570 
    571     BEGIN_STATE(ScriptDataLessThanSignState) {
    572         if (cc == '/') {
    573             m_temporaryBuffer.clear();
    574             ASSERT(m_bufferedEndTagName.isEmpty());
    575             ADVANCE_TO(ScriptDataEndTagOpenState);
    576         } else if (cc == '!') {
    577             bufferCharacter('<');
    578             bufferCharacter('!');
    579             ADVANCE_TO(ScriptDataEscapeStartState);
    580         } else {
    581             bufferCharacter('<');
    582             RECONSUME_IN(ScriptDataState);
    583         }
    584     }
    585     END_STATE()
    586 
    587     BEGIN_STATE(ScriptDataEndTagOpenState) {
    588         if (isASCIIUpper(cc)) {
    589             m_temporaryBuffer.append(cc);
    590             addToPossibleEndTag(toLowerCase(cc));
    591             ADVANCE_TO(ScriptDataEndTagNameState);
    592         } else if (isASCIILower(cc)) {
    593             m_temporaryBuffer.append(cc);
    594             addToPossibleEndTag(cc);
    595             ADVANCE_TO(ScriptDataEndTagNameState);
    596         } else {
    597             bufferCharacter('<');
    598             bufferCharacter('/');
    599             RECONSUME_IN(ScriptDataState);
    600         }
    601     }
    602     END_STATE()
    603 
    604     BEGIN_STATE(ScriptDataEndTagNameState) {
    605         if (isASCIIUpper(cc)) {
    606             m_temporaryBuffer.append(cc);
    607             addToPossibleEndTag(toLowerCase(cc));
    608             ADVANCE_TO(ScriptDataEndTagNameState);
    609         } else if (isASCIILower(cc)) {
    610             m_temporaryBuffer.append(cc);
    611             addToPossibleEndTag(cc);
    612             ADVANCE_TO(ScriptDataEndTagNameState);
    613         } else {
    614             if (isTokenizerWhitespace(cc)) {
    615                 if (isAppropriateEndTag())
    616                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
    617             } else if (cc == '/') {
    618                 if (isAppropriateEndTag())
    619                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
    620             } else if (cc == '>') {
    621                 if (isAppropriateEndTag())
    622                     return flushEmitAndResumeIn(source, DataState);
    623             }
    624             bufferCharacter('<');
    625             bufferCharacter('/');
    626             m_token->appendToCharacter(m_temporaryBuffer);
    627             m_bufferedEndTagName.clear();
    628             RECONSUME_IN(ScriptDataState);
    629         }
    630     }
    631     END_STATE()
    632 
    633     BEGIN_STATE(ScriptDataEscapeStartState) {
    634         if (cc == '-') {
    635             bufferCharacter(cc);
    636             ADVANCE_TO(ScriptDataEscapeStartDashState);
    637         } else
    638             RECONSUME_IN(ScriptDataState);
    639     }
    640     END_STATE()
    641 
    642     BEGIN_STATE(ScriptDataEscapeStartDashState) {
    643         if (cc == '-') {
    644             bufferCharacter(cc);
    645             ADVANCE_TO(ScriptDataEscapedDashDashState);
    646         } else
    647             RECONSUME_IN(ScriptDataState);
    648     }
    649     END_STATE()
    650 
    651     BEGIN_STATE(ScriptDataEscapedState) {
    652         if (cc == '-') {
    653             bufferCharacter(cc);
    654             ADVANCE_TO(ScriptDataEscapedDashState);
    655         } else if (cc == '<')
    656             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
    657         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    658             parseError();
    659             RECONSUME_IN(DataState);
    660         } else {
    661             bufferCharacter(cc);
    662             ADVANCE_TO(ScriptDataEscapedState);
    663         }
    664     }
    665     END_STATE()
    666 
    667     BEGIN_STATE(ScriptDataEscapedDashState) {
    668         if (cc == '-') {
    669             bufferCharacter(cc);
    670             ADVANCE_TO(ScriptDataEscapedDashDashState);
    671         } else if (cc == '<')
    672             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
    673         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    674             parseError();
    675             RECONSUME_IN(DataState);
    676         } else {
    677             bufferCharacter(cc);
    678             ADVANCE_TO(ScriptDataEscapedState);
    679         }
    680     }
    681     END_STATE()
    682 
    683     BEGIN_STATE(ScriptDataEscapedDashDashState) {
    684         if (cc == '-') {
    685             bufferCharacter(cc);
    686             ADVANCE_TO(ScriptDataEscapedDashDashState);
    687         } else if (cc == '<')
    688             ADVANCE_TO(ScriptDataEscapedLessThanSignState);
    689         else if (cc == '>') {
    690             bufferCharacter(cc);
    691             ADVANCE_TO(ScriptDataState);
    692         } if (cc == InputStreamPreprocessor::endOfFileMarker) {
    693             parseError();
    694             RECONSUME_IN(DataState);
    695         } else {
    696             bufferCharacter(cc);
    697             ADVANCE_TO(ScriptDataEscapedState);
    698         }
    699     }
    700     END_STATE()
    701 
    702     BEGIN_STATE(ScriptDataEscapedLessThanSignState) {
    703         if (cc == '/') {
    704             m_temporaryBuffer.clear();
    705             ASSERT(m_bufferedEndTagName.isEmpty());
    706             ADVANCE_TO(ScriptDataEscapedEndTagOpenState);
    707         } else if (isASCIIUpper(cc)) {
    708             bufferCharacter('<');
    709             bufferCharacter(cc);
    710             m_temporaryBuffer.clear();
    711             m_temporaryBuffer.append(toLowerCase(cc));
    712             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
    713         } else if (isASCIILower(cc)) {
    714             bufferCharacter('<');
    715             bufferCharacter(cc);
    716             m_temporaryBuffer.clear();
    717             m_temporaryBuffer.append(cc);
    718             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
    719         } else {
    720             bufferCharacter('<');
    721             RECONSUME_IN(ScriptDataEscapedState);
    722         }
    723     }
    724     END_STATE()
    725 
    726     BEGIN_STATE(ScriptDataEscapedEndTagOpenState) {
    727         if (isASCIIUpper(cc)) {
    728             m_temporaryBuffer.append(cc);
    729             addToPossibleEndTag(toLowerCase(cc));
    730             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
    731         } else if (isASCIILower(cc)) {
    732             m_temporaryBuffer.append(cc);
    733             addToPossibleEndTag(cc);
    734             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
    735         } else {
    736             bufferCharacter('<');
    737             bufferCharacter('/');
    738             RECONSUME_IN(ScriptDataEscapedState);
    739         }
    740     }
    741     END_STATE()
    742 
    743     BEGIN_STATE(ScriptDataEscapedEndTagNameState) {
                // Accumulates ASCII letters of a candidate end tag name. Whitespace,
                // '/' and '>' complete the tag only when isAppropriateEndTag() is true;
                // in every other case the text seen so far is buffered as characters
                // and cc is reconsumed in ScriptDataEscapedState.
    744         if (isASCIIUpper(cc)) {
    745             m_temporaryBuffer.append(cc);
    746             addToPossibleEndTag(toLowerCase(cc));
    747             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
    748         } else if (isASCIILower(cc)) {
    749             m_temporaryBuffer.append(cc);
    750             addToPossibleEndTag(cc);
    751             ADVANCE_TO(ScriptDataEscapedEndTagNameState);
    752         } else {
                    // NOTE(review): the statements after this if-chain are meant for the
                    // "not an appropriate end tag" case; this relies on
                    // FLUSH_AND_ADVANCE_TO leaving the block -- confirm against the
                    // macro definition (outside this chunk).
    753             if (isTokenizerWhitespace(cc)) {
    754                 if (isAppropriateEndTag())
    755                     FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
    756             } else if (cc == '/') {
    757                 if (isAppropriateEndTag())
    758                     FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
    759             } else if (cc == '>') {
    760                 if (isAppropriateEndTag())
    761                     return flushEmitAndResumeIn(source, DataState);
    762             }
                    // Not an end tag after all: buffer "</" followed by the characters
                    // collected in m_temporaryBuffer, and discard the candidate name.
    763             bufferCharacter('<');
    764             bufferCharacter('/');
    765             m_token->appendToCharacter(m_temporaryBuffer);
    766             m_bufferedEndTagName.clear();
    767             RECONSUME_IN(ScriptDataEscapedState);
    768         }
    769     }
    770     END_STATE()
    771 
    772     BEGIN_STATE(ScriptDataDoubleEscapeStartState) {
                // Collects ASCII letters (lowercased) in m_temporaryBuffer while also
                // buffering them as written. On whitespace, '/' or '>' the collected
                // name decides the next state: ScriptDataDoubleEscapedState if it
                // equals scriptTag.localName(), ScriptDataEscapedState otherwise.
                // Any other character is reconsumed in ScriptDataEscapedState.
    773         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
    774             bufferCharacter(cc);
    775             if (temporaryBufferIs(scriptTag.localName()))
    776                 ADVANCE_TO(ScriptDataDoubleEscapedState);
    777             else
    778                 ADVANCE_TO(ScriptDataEscapedState);
    779         } else if (isASCIIUpper(cc)) {
    780             bufferCharacter(cc);
    781             m_temporaryBuffer.append(toLowerCase(cc));
    782             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
    783         } else if (isASCIILower(cc)) {
    784             bufferCharacter(cc);
    785             m_temporaryBuffer.append(cc);
    786             ADVANCE_TO(ScriptDataDoubleEscapeStartState);
    787         } else
    788             RECONSUME_IN(ScriptDataEscapedState);
    789     }
    790     END_STATE()
    791 
    792     BEGIN_STATE(ScriptDataDoubleEscapedState) {
                // Buffers every character except the end-of-file marker. '-' and '<'
                // additionally move to the dash / less-than-sign sub-states; the
                // end-of-file marker is a parse error and is reconsumed in DataState.
    793         if (cc == '-') {
    794             bufferCharacter(cc);
    795             ADVANCE_TO(ScriptDataDoubleEscapedDashState);
    796         } else if (cc == '<') {
    797             bufferCharacter(cc);
    798             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
    799         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    800             parseError();
    801             RECONSUME_IN(DataState);
    802         } else {
    803             bufferCharacter(cc);
    804             ADVANCE_TO(ScriptDataDoubleEscapedState);
    805         }
    806     }
    807     END_STATE()
    808 
    809     BEGIN_STATE(ScriptDataDoubleEscapedDashState) {
                // Reached from ScriptDataDoubleEscapedState on '-'. A second '-' moves
                // to the dash-dash state, '<' to the less-than-sign state, the
                // end-of-file marker is a parse error, and any other character is
                // buffered and returns to ScriptDataDoubleEscapedState.
    810         if (cc == '-') {
    811             bufferCharacter(cc);
    812             ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
    813         } else if (cc == '<') {
    814             bufferCharacter(cc);
    815             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
    816         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    817             parseError();
    818             RECONSUME_IN(DataState);
    819         } else {
    820             bufferCharacter(cc);
    821             ADVANCE_TO(ScriptDataDoubleEscapedState);
    822         }
    823     }
    824     END_STATE()
    825 
    826     BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) {
                // Reached after two consecutive '-' characters. Further '-' characters
                // stay in this state; '>' is buffered and moves to ScriptDataState;
                // '<' moves to the less-than-sign state; the end-of-file marker is a
                // parse error; any other character is buffered and returns to
                // ScriptDataDoubleEscapedState.
    827         if (cc == '-') {
    828             bufferCharacter(cc);
    829             ADVANCE_TO(ScriptDataDoubleEscapedDashDashState);
    830         } else if (cc == '<') {
    831             bufferCharacter(cc);
    832             ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState);
    833         } else if (cc == '>') {
    834             bufferCharacter(cc);
    835             ADVANCE_TO(ScriptDataState);
    836         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    837             parseError();
    838             RECONSUME_IN(DataState);
    839         } else {
    840             bufferCharacter(cc);
    841             ADVANCE_TO(ScriptDataDoubleEscapedState);
    842         }
    843     }
    844     END_STATE()
    845 
    846     BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) {
                // The '<' itself was already buffered by the state that led here.
                // '/' is buffered, resets m_temporaryBuffer and starts looking for the
                // end of the double escape; anything else is reconsumed in
                // ScriptDataDoubleEscapedState.
    847         if (cc == '/') {
    848             bufferCharacter(cc);
    849             m_temporaryBuffer.clear();
    850             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
    851         } else
    852             RECONSUME_IN(ScriptDataDoubleEscapedState);
    853     }
    854     END_STATE()
    855 
    856     BEGIN_STATE(ScriptDataDoubleEscapeEndState) {
                // Mirror image of ScriptDataDoubleEscapeStartState: letters are
                // buffered and collected (lowercased) in m_temporaryBuffer. On
                // whitespace, '/' or '>' a collected name equal to
                // scriptTag.localName() leaves the double-escaped mode
                // (ScriptDataEscapedState); otherwise the tokenizer stays in
                // ScriptDataDoubleEscapedState.
    857         if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') {
    858             bufferCharacter(cc);
    859             if (temporaryBufferIs(scriptTag.localName()))
    860                 ADVANCE_TO(ScriptDataEscapedState);
    861             else
    862                 ADVANCE_TO(ScriptDataDoubleEscapedState);
    863         } else if (isASCIIUpper(cc)) {
    864             bufferCharacter(cc);
    865             m_temporaryBuffer.append(toLowerCase(cc));
    866             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
    867         } else if (isASCIILower(cc)) {
    868             bufferCharacter(cc);
    869             m_temporaryBuffer.append(cc);
    870             ADVANCE_TO(ScriptDataDoubleEscapeEndState);
    871         } else
    872             RECONSUME_IN(ScriptDataDoubleEscapedState);
    873     }
    874     END_STATE()
    875 
    876     BEGIN_STATE(BeforeAttributeNameState) {
                // Skips whitespace before an attribute name. '/' goes to
                // SelfClosingStartTagState, '>' emits the token, and any other
                // non-EOF character starts a new attribute whose name begins with
                // that character (ASCII uppercase letters are lowercased).
    877         if (isTokenizerWhitespace(cc))
    878             ADVANCE_TO(BeforeAttributeNameState);
    879         else if (cc == '/')
    880             ADVANCE_TO(SelfClosingStartTagState);
    881         else if (cc == '>')
    882             return emitAndResumeIn(source, DataState);
                // Only when m_usePreHTML5ParserQuirks is set: a '<' ends the current
                // tag and is reconsumed in DataState.
    883         else if (m_usePreHTML5ParserQuirks && cc == '<')
    884             return emitAndReconsumeIn(source, DataState);
    885         else if (isASCIIUpper(cc)) {
    886             m_token->addNewAttribute();
    887             m_token->beginAttributeName(source.numberOfCharactersConsumed());
    888             m_token->appendToAttributeName(toLowerCase(cc));
    889             ADVANCE_TO(AttributeNameState);
    890         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    891             parseError();
    892             RECONSUME_IN(DataState);
    893         } else {
                    // These characters are reported as a parse error but still become
                    // the first character of the attribute name.
    894             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
    895                 parseError();
    896             m_token->addNewAttribute();
    897             m_token->beginAttributeName(source.numberOfCharactersConsumed());
    898             m_token->appendToAttributeName(cc);
    899             ADVANCE_TO(AttributeNameState);
    900         }
    901     }
    902     END_STATE()
    903 
    904     BEGIN_STATE(AttributeNameState) {
                // Accumulates the characters of an attribute name. Whitespace, '/',
                // '=' and '>' terminate the name (its end position is recorded via
                // endAttributeName()) and select the next state; ASCII uppercase
                // letters are appended lowercased; the end-of-file marker is a parse
                // error and is reconsumed in DataState.
    905         if (isTokenizerWhitespace(cc)) {
    906             m_token->endAttributeName(source.numberOfCharactersConsumed());
    907             ADVANCE_TO(AfterAttributeNameState);
    908         } else if (cc == '/') {
    909             m_token->endAttributeName(source.numberOfCharactersConsumed());
    910             ADVANCE_TO(SelfClosingStartTagState);
    911         } else if (cc == '=') {
    912             m_token->endAttributeName(source.numberOfCharactersConsumed());
    913             ADVANCE_TO(BeforeAttributeValueState);
    914         } else if (cc == '>') {
    915             m_token->endAttributeName(source.numberOfCharactersConsumed());
    916             return emitAndResumeIn(source, DataState);
                // Only when m_usePreHTML5ParserQuirks is set: a '<' ends the current
                // tag and is reconsumed in DataState.
    917         } else if (m_usePreHTML5ParserQuirks && cc == '<') {
    918             m_token->endAttributeName(source.numberOfCharactersConsumed());
    919             return emitAndReconsumeIn(source, DataState);
    920         } else if (isASCIIUpper(cc)) {
    921             m_token->appendToAttributeName(toLowerCase(cc));
    922             ADVANCE_TO(AttributeNameState);
    923         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    924             parseError();
    925             m_token->endAttributeName(source.numberOfCharactersConsumed());
    926             RECONSUME_IN(DataState);
    927         } else {
                    // '=' never reaches this branch (it is handled above), so only the
                    // quote characters and '<' are reported as parse errors here. The
                    // character is appended to the name either way.
    928             if (cc == '"' || cc == '\'' || cc == '<')
    929                 parseError();
    930             m_token->appendToAttributeName(cc);
    931             ADVANCE_TO(AttributeNameState);
    932         }
    933     }
    934     END_STATE()
    935 
    936     BEGIN_STATE(AfterAttributeNameState) {
                // Entered from AttributeNameState on whitespace. Skips further
                // whitespace; '=' moves to the attribute value, '/' and '>' finish
                // the tag, and any other non-EOF character starts a NEW attribute
                // (the previous one keeps no value).
    937         if (isTokenizerWhitespace(cc))
    938             ADVANCE_TO(AfterAttributeNameState);
    939         else if (cc == '/')
    940             ADVANCE_TO(SelfClosingStartTagState);
    941         else if (cc == '=')
    942             ADVANCE_TO(BeforeAttributeValueState);
    943         else if (cc == '>')
    944             return emitAndResumeIn(source, DataState);
    945         else if (m_usePreHTML5ParserQuirks && cc == '<')
    946             return emitAndReconsumeIn(source, DataState);
    947         else if (isASCIIUpper(cc)) {
    948             m_token->addNewAttribute();
    949             m_token->beginAttributeName(source.numberOfCharactersConsumed());
    950             m_token->appendToAttributeName(toLowerCase(cc));
    951             ADVANCE_TO(AttributeNameState);
    952         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    953             parseError();
    954             RECONSUME_IN(DataState);
    955         } else {
                    // Quote characters and '<' are parse errors but are still used as
                    // the first character of the new attribute name.
    956             if (cc == '"' || cc == '\'' || cc == '<')
    957                 parseError();
    958             m_token->addNewAttribute();
    959             m_token->beginAttributeName(source.numberOfCharactersConsumed());
    960             m_token->appendToAttributeName(cc);
    961             ADVANCE_TO(AttributeNameState);
    962         }
    963     }
    964     END_STATE()
    965 
    966     BEGIN_STATE(BeforeAttributeValueState) {
                // Skips whitespace after '=' and selects the attribute value flavour:
                // double-quoted, single-quoted or unquoted. '>' here is a parse error
                // that still emits the token; the end-of-file marker is a parse error
                // reconsumed in DataState.
    967         if (isTokenizerWhitespace(cc))
    968             ADVANCE_TO(BeforeAttributeValueState);
    969         else if (cc == '"') {
                    // NOTE(review): the "+ 1" presumably places the value start just
                    // past the opening quote -- confirm against beginAttributeValue().
    970             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
    971             ADVANCE_TO(AttributeValueDoubleQuotedState);
    972         } else if (cc == '&') {
                    // '&' is not appended here; it is reconsumed so that
                    // AttributeValueUnquotedState handles it.
    973             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
    974             RECONSUME_IN(AttributeValueUnquotedState);
    975         } else if (cc == '\'') {
    976             m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
    977             ADVANCE_TO(AttributeValueSingleQuotedState);
    978         } else if (cc == '>') {
    979             parseError();
    980             return emitAndResumeIn(source, DataState);
    981         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
    982             parseError();
    983             RECONSUME_IN(DataState);
    984         } else {
                    // '<', '=' and '`' are parse errors but still become the first
                    // character of an unquoted value.
    985             if (cc == '<' || cc == '=' || cc == '`')
    986                 parseError();
    987             m_token->beginAttributeValue(source.numberOfCharactersConsumed());
    988             m_token->appendToAttributeValue(cc);
    989             ADVANCE_TO(AttributeValueUnquotedState);
    990         }
    991     }
    992     END_STATE()
    993 
    994     BEGIN_STATE(AttributeValueDoubleQuotedState) {
                // Appends characters to the attribute value until the closing '"'.
                // '&' hands off to CharacterReferenceInAttributeValueState, recording
                // '"' in m_additionalAllowedCharacter so that state can find its way
                // back here.
    995         if (cc == '"') {
    996             m_token->endAttributeValue(source.numberOfCharactersConsumed());
    997             ADVANCE_TO(AfterAttributeValueQuotedState);
    998         } else if (cc == '&') {
    999             m_additionalAllowedCharacter = '"';
   1000             ADVANCE_TO(CharacterReferenceInAttributeValueState);
   1001         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1002             parseError();
   1003             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1004             RECONSUME_IN(DataState);
   1005         } else {
   1006             m_token->appendToAttributeValue(cc);
   1007             ADVANCE_TO(AttributeValueDoubleQuotedState);
   1008         }
   1009     }
   1010     END_STATE()
   1011 
   1012     BEGIN_STATE(AttributeValueSingleQuotedState) {
                // Appends characters to the attribute value until the closing '\''.
                // '&' hands off to CharacterReferenceInAttributeValueState, recording
                // '\'' in m_additionalAllowedCharacter so that state can find its way
                // back here.
   1013         if (cc == '\'') {
   1014             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1015             ADVANCE_TO(AfterAttributeValueQuotedState);
   1016         } else if (cc == '&') {
   1017             m_additionalAllowedCharacter = '\'';
   1018             ADVANCE_TO(CharacterReferenceInAttributeValueState);
   1019         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1020             parseError();
   1021             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1022             RECONSUME_IN(DataState);
   1023         } else {
   1024             m_token->appendToAttributeValue(cc);
   1025             ADVANCE_TO(AttributeValueSingleQuotedState);
   1026         }
   1027     }
   1028     END_STATE()
   1029 
   1030     BEGIN_STATE(AttributeValueUnquotedState) {
                // Appends characters to an unquoted attribute value. Whitespace ends
                // the value and returns to BeforeAttributeNameState; '>' ends the
                // value and emits the token; '&' hands off to
                // CharacterReferenceInAttributeValueState with '>' recorded in
                // m_additionalAllowedCharacter.
   1031         if (isTokenizerWhitespace(cc)) {
   1032             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1033             ADVANCE_TO(BeforeAttributeNameState);
   1034         } else if (cc == '&') {
   1035             m_additionalAllowedCharacter = '>';
   1036             ADVANCE_TO(CharacterReferenceInAttributeValueState);
   1037         } else if (cc == '>') {
   1038             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1039             return emitAndResumeIn(source, DataState);
   1040         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1041             parseError();
   1042             m_token->endAttributeValue(source.numberOfCharactersConsumed());
   1043             RECONSUME_IN(DataState);
   1044         } else {
                    // These characters are parse errors inside an unquoted value but
                    // are appended to it regardless.
   1045             if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
   1046                 parseError();
   1047             m_token->appendToAttributeValue(cc);
   1048             ADVANCE_TO(AttributeValueUnquotedState);
   1049         }
   1050     }
   1051     END_STATE()
   1052 
   1053     BEGIN_STATE(CharacterReferenceInAttributeValueState) {
                // Tries to decode a character reference (entered after '&' inside an
                // attribute value) and appends the result to the attribute value,
                // then returns to the attribute value state it came from.
   1054         bool notEnoughCharacters = false;
   1055         Vector<UChar, 16> decodedEntity;
   1056         bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
                // Not enough input to decide yet: return without changing the
                // attribute value.
   1057         if (notEnoughCharacters)
   1058             return haveBufferedCharacterToken();
   1059         if (!success) {
                    // No entity was decoded: keep the '&' as a literal character.
   1060             ASSERT(decodedEntity.isEmpty());
   1061             m_token->appendToAttributeValue('&');
   1062         } else {
   1063             Vector<UChar>::const_iterator iter = decodedEntity.begin();
   1064             for (; iter != decodedEntity.end(); ++iter)
   1065                 m_token->appendToAttributeValue(*iter);
   1066         }
   1067         // We're supposed to switch back to the attribute value state that
   1068         // we were in when we were switched into this state. Rather than
   1069         // keeping track of this explicitly, we observe that the previous
   1070         // state can be determined by m_additionalAllowedCharacter.
   1071         if (m_additionalAllowedCharacter == '"')
   1072             SWITCH_TO(AttributeValueDoubleQuotedState);
   1073         else if (m_additionalAllowedCharacter == '\'')
   1074             SWITCH_TO(AttributeValueSingleQuotedState);
   1075         else if (m_additionalAllowedCharacter == '>')
   1076             SWITCH_TO(AttributeValueUnquotedState);
   1077         else
   1078             ASSERT_NOT_REACHED();
   1079     }
   1080     END_STATE()
   1081 
   1082     BEGIN_STATE(AfterAttributeValueQuotedState) {
                // Entered after the closing quote of an attribute value. Whitespace,
                // '/' and '>' are the expected continuations; any other non-EOF
                // character is a parse error and is reconsumed in
                // BeforeAttributeNameState (i.e. treated as the start of a new
                // attribute).
   1083         if (isTokenizerWhitespace(cc))
   1084             ADVANCE_TO(BeforeAttributeNameState);
   1085         else if (cc == '/')
   1086             ADVANCE_TO(SelfClosingStartTagState);
   1087         else if (cc == '>')
   1088             return emitAndResumeIn(source, DataState);
   1089         else if (m_usePreHTML5ParserQuirks && cc == '<')
   1090             return emitAndReconsumeIn(source, DataState);
   1091         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1092             parseError();
   1093             RECONSUME_IN(DataState);
   1094         } else {
   1095             parseError();
   1096             RECONSUME_IN(BeforeAttributeNameState);
   1097         }
   1098     }
   1099     END_STATE()
   1100 
   1101     BEGIN_STATE(SelfClosingStartTagState) {
                // Entered after '/' inside a tag. Only an immediately following '>'
                // marks the token as self-closing and emits it; anything else is a
                // parse error (the end-of-file marker is reconsumed in DataState, any
                // other character in BeforeAttributeNameState).
   1102         if (cc == '>') {
   1103             m_token->setSelfClosing();
   1104             return emitAndResumeIn(source, DataState);
   1105         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1106             parseError();
   1107             RECONSUME_IN(DataState);
   1108         } else {
   1109             parseError();
   1110             RECONSUME_IN(BeforeAttributeNameState);
   1111         }
   1112     }
   1113     END_STATE()
   1114 
   1115     BEGIN_STATE(BogusCommentState) {
                // Starts a comment token and lets ContinueBogusCommentState consume
                // the current character and everything after it.
   1116         m_token->beginComment();
   1117         RECONSUME_IN(ContinueBogusCommentState);
   1118     }
   1119     END_STATE()
   1120 
   1121     BEGIN_STATE(ContinueBogusCommentState) {
                // Appends every character to the comment until '>' (which is not
                // appended) or the end-of-file marker; either one emits the comment
                // token. No parse error is reported in this state.
   1122         if (cc == '>')
   1123             return emitAndResumeIn(source, DataState);
   1124         else if (cc == InputStreamPreprocessor::endOfFileMarker)
   1125             return emitAndReconsumeIn(source, DataState);
   1126         else {
   1127             m_token->appendToComment(cc);
   1128             ADVANCE_TO(ContinueBogusCommentState);
   1129         }
   1130     }
   1131     END_STATE()
   1132 
   1133     BEGIN_STATE(MarkupDeclarationOpenState) {
                // Uses look-ahead on the input to recognise "--" (comment), "doctype"
                // (case-insensitive) or, when shouldAllowCDATA() is true, "[CDATA[".
                // If the look-ahead cannot decide yet (NotEnoughCharacters) the
                // function returns; if nothing matches, the input is treated as a
                // bogus comment.
   1134         DEFINE_STATIC_LOCAL(String, dashDashString, ("--"));
   1135         DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype"));
   1136         DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA["));
   1137         if (cc == '-') {
   1138             SegmentedString::LookAheadResult result = source.lookAhead(dashDashString);
   1139             if (result == SegmentedString::DidMatch) {
   1140                 source.advanceAndASSERT('-');
   1141                 source.advanceAndASSERT('-');
   1142                 m_token->beginComment();
   1143                 SWITCH_TO(CommentStartState);
   1144             } else if (result == SegmentedString::NotEnoughCharacters)
   1145                 return haveBufferedCharacterToken();
   1146         } else if (cc == 'D' || cc == 'd') {
   1147             SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString);
   1148             if (result == SegmentedString::DidMatch) {
   1149                 advanceStringAndASSERTIgnoringCase(source, "doctype");
   1150                 SWITCH_TO(DOCTYPEState);
   1151             } else if (result == SegmentedString::NotEnoughCharacters)
   1152                 return haveBufferedCharacterToken();
   1153         } else if (cc == '[' && shouldAllowCDATA()) {
   1154             SegmentedString::LookAheadResult result = source.lookAhead(cdataString);
   1155             if (result == SegmentedString::DidMatch) {
   1156                 advanceStringAndASSERT(source, "[CDATA[");
   1157                 SWITCH_TO(CDATASectionState);
   1158             } else if (result == SegmentedString::NotEnoughCharacters)
   1159                 return haveBufferedCharacterToken();
   1160         }
                // Reached when no keyword matched.
                // NOTE(review): this relies on SWITCH_TO leaving the block in the
                // DidMatch branches above -- confirm against the macro definition.
   1161         parseError();
   1162         RECONSUME_IN(BogusCommentState);
   1163     }
   1164     END_STATE()
   1165 
   1166     BEGIN_STATE(CommentStartState) {
                // First character after "<!--". '-' moves to CommentStartDashState;
                // '>' (parse error) emits the comment as-is; the end-of-file marker
                // (parse error) emits the comment and is reconsumed in DataState; any
                // other character becomes the first character of the comment.
   1167         if (cc == '-')
   1168             ADVANCE_TO(CommentStartDashState);
   1169         else if (cc == '>') {
   1170             parseError();
   1171             return emitAndResumeIn(source, DataState);
   1172         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1173             parseError();
   1174             return emitAndReconsumeIn(source, DataState);
   1175         } else {
   1176             m_token->appendToComment(cc);
   1177             ADVANCE_TO(CommentState);
   1178         }
   1179     }
   1180     END_STATE()
   1181 
   1182     BEGIN_STATE(CommentStartDashState) {
                // Reached from CommentStartState on '-'. A second '-' moves to
                // CommentEndState; '>' and the end-of-file marker are parse errors
                // that emit the comment; otherwise the pending '-' and cc are both
                // appended to the comment.
   1183         if (cc == '-')
   1184             ADVANCE_TO(CommentEndState);
   1185         else if (cc == '>') {
   1186             parseError();
   1187             return emitAndResumeIn(source, DataState);
   1188         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1189             parseError();
   1190             return emitAndReconsumeIn(source, DataState);
   1191         } else {
   1192             m_token->appendToComment('-');
   1193             m_token->appendToComment(cc);
   1194             ADVANCE_TO(CommentState);
   1195         }
   1196     }
   1197     END_STATE()
   1198 
   1199     BEGIN_STATE(CommentState) {
                // Main comment body: appends every character to the comment. '-' is
                // held back (not appended) and moves to CommentEndDashState; the
                // end-of-file marker is a parse error that emits the comment.
   1200         if (cc == '-')
   1201             ADVANCE_TO(CommentEndDashState);
   1202         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1203             parseError();
   1204             return emitAndReconsumeIn(source, DataState);
   1205         } else {
   1206             m_token->appendToComment(cc);
   1207             ADVANCE_TO(CommentState);
   1208         }
   1209     }
   1210     END_STATE()
   1211 
   1212     BEGIN_STATE(CommentEndDashState) {
                // Reached after a single '-' inside a comment. A second '-' moves to
                // CommentEndState; otherwise the held-back '-' and cc are appended
                // and the tokenizer returns to CommentState. The end-of-file marker
                // is a parse error that emits the comment.
   1213         if (cc == '-')
   1214             ADVANCE_TO(CommentEndState);
   1215         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1216             parseError();
   1217             return emitAndReconsumeIn(source, DataState);
   1218         } else {
   1219             m_token->appendToComment('-');
   1220             m_token->appendToComment(cc);
   1221             ADVANCE_TO(CommentState);
   1222         }
   1223     }
   1224     END_STATE()
   1225 
   1226     BEGIN_STATE(CommentEndState) {
                // Reached after "--" inside a comment. '>' emits the comment without
                // error; every other input is a parse error: '!' moves to
                // CommentEndBangState, an extra '-' is appended and stays here, the
                // end-of-file marker emits the comment, and any other character
                // appends the held-back "--" plus cc and returns to CommentState.
   1227         if (cc == '>')
   1228             return emitAndResumeIn(source, DataState);
   1229         else if (cc == '!') {
   1230             parseError();
   1231             ADVANCE_TO(CommentEndBangState);
   1232         } else if (cc == '-') {
   1233             parseError();
   1234             m_token->appendToComment('-');
   1235             ADVANCE_TO(CommentEndState);
   1236         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1237             parseError();
   1238             return emitAndReconsumeIn(source, DataState);
   1239         } else {
   1240             parseError();
   1241             m_token->appendToComment('-');
   1242             m_token->appendToComment('-');
   1243             m_token->appendToComment(cc);
   1244             ADVANCE_TO(CommentState);
   1245         }
   1246     }
   1247     END_STATE()
   1248 
   1249     BEGIN_STATE(CommentEndBangState) {
                // Reached after "--!" inside a comment. '>' emits the comment (the
                // "--!" is dropped). '-' appends "--!" and moves to
                // CommentEndDashState; any other non-EOF character appends "--!" plus
                // cc and returns to CommentState. The end-of-file marker is a parse
                // error that emits the comment.
   1250         if (cc == '-') {
   1251             m_token->appendToComment('-');
   1252             m_token->appendToComment('-');
   1253             m_token->appendToComment('!');
   1254             ADVANCE_TO(CommentEndDashState);
   1255         } else if (cc == '>')
   1256             return emitAndResumeIn(source, DataState);
   1257         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1258             parseError();
   1259             return emitAndReconsumeIn(source, DataState);
   1260         } else {
   1261             m_token->appendToComment('-');
   1262             m_token->appendToComment('-');
   1263             m_token->appendToComment('!');
   1264             m_token->appendToComment(cc);
   1265             ADVANCE_TO(CommentState);
   1266         }
   1267     }
   1268     END_STATE()
   1269 
   1270     BEGIN_STATE(DOCTYPEState) {
                // First character after the "doctype" keyword. Whitespace is the
                // expected separator; any other non-EOF character is a parse error
                // and is reconsumed in BeforeDOCTYPENameState. The end-of-file marker
                // is a parse error that begins a DOCTYPE token, sets force-quirks on
                // it and emits it.
   1271         if (isTokenizerWhitespace(cc))
   1272             ADVANCE_TO(BeforeDOCTYPENameState);
   1273         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1274             parseError();
   1275             m_token->beginDOCTYPE();
   1276             m_token->setForceQuirks();
   1277             return emitAndReconsumeIn(source, DataState);
   1278         } else {
   1279             parseError();
   1280             RECONSUME_IN(BeforeDOCTYPENameState);
   1281         }
   1282     }
   1283     END_STATE()
   1284 
   1285     BEGIN_STATE(BeforeDOCTYPENameState) {
                // Skips whitespace before the DOCTYPE name. The first other character
                // begins the DOCTYPE token with that character (ASCII uppercase is
                // lowercased). '>' and the end-of-file marker are parse errors that
                // begin a DOCTYPE token without a name character, set force-quirks
                // and emit it.
   1286         if (isTokenizerWhitespace(cc))
   1287             ADVANCE_TO(BeforeDOCTYPENameState);
   1288         else if (isASCIIUpper(cc)) {
   1289             m_token->beginDOCTYPE(toLowerCase(cc));
   1290             ADVANCE_TO(DOCTYPENameState);
   1291         } else if (cc == '>') {
   1292             parseError();
   1293             m_token->beginDOCTYPE();
   1294             m_token->setForceQuirks();
   1295             return emitAndResumeIn(source, DataState);
   1296         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1297             parseError();
   1298             m_token->beginDOCTYPE();
   1299             m_token->setForceQuirks();
   1300             return emitAndReconsumeIn(source, DataState);
   1301         } else {
   1302             m_token->beginDOCTYPE(cc);
   1303             ADVANCE_TO(DOCTYPENameState);
   1304         }
   1305     }
   1306     END_STATE()
   1307 
   1308     BEGIN_STATE(DOCTYPENameState) {
                // Accumulates the DOCTYPE name (ASCII uppercase is lowercased).
                // Whitespace ends the name, '>' emits the token, and the end-of-file
                // marker is a parse error that sets force-quirks and emits the token.
   1309         if (isTokenizerWhitespace(cc))
   1310             ADVANCE_TO(AfterDOCTYPENameState);
   1311         else if (cc == '>')
   1312             return emitAndResumeIn(source, DataState);
   1313         else if (isASCIIUpper(cc)) {
   1314             m_token->appendToName(toLowerCase(cc));
   1315             ADVANCE_TO(DOCTYPENameState);
   1316         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1317             parseError();
   1318             m_token->setForceQuirks();
   1319             return emitAndReconsumeIn(source, DataState);
   1320         } else {
   1321             m_token->appendToName(cc);
   1322             ADVANCE_TO(DOCTYPENameState);
   1323         }
   1324     }
   1325     END_STATE()
   1326 
   1327     BEGIN_STATE(AfterDOCTYPENameState) {
                // Entered after the DOCTYPE name. Skips whitespace; '>' emits the
                // token; the end-of-file marker is a parse error that sets
                // force-quirks and emits the token. Otherwise look-ahead is used to
                // recognise the case-insensitive keywords "public" and "system"; if
                // neither matches, the DOCTYPE is marked force-quirks and the
                // tokenizer moves to BogusDOCTYPEState.
   1328         if (isTokenizerWhitespace(cc))
   1329             ADVANCE_TO(AfterDOCTYPENameState);
                // Chained with "else" like every sibling state, so the branches are
                // mutually exclusive regardless of how ADVANCE_TO is implemented.
   1330         else if (cc == '>')
   1331             return emitAndResumeIn(source, DataState);
   1332         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1333             parseError();
   1334             m_token->setForceQuirks();
   1335             return emitAndReconsumeIn(source, DataState);
   1336         } else {
   1337             DEFINE_STATIC_LOCAL(String, publicString, ("public"));
   1338             DEFINE_STATIC_LOCAL(String, systemString, ("system"));
   1339             if (cc == 'P' || cc == 'p') {
   1340                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString);
   1341                 if (result == SegmentedString::DidMatch) {
   1342                     advanceStringAndASSERTIgnoringCase(source, "public");
   1343                     SWITCH_TO(AfterDOCTYPEPublicKeywordState);
   1344                 } else if (result == SegmentedString::NotEnoughCharacters)
   1345                     return haveBufferedCharacterToken();
   1346             } else if (cc == 'S' || cc == 's') {
   1347                 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString);
   1348                 if (result == SegmentedString::DidMatch) {
   1349                     advanceStringAndASSERTIgnoringCase(source, "system");
   1350                     SWITCH_TO(AfterDOCTYPESystemKeywordState);
   1351                 } else if (result == SegmentedString::NotEnoughCharacters)
   1352                     return haveBufferedCharacterToken();
   1353             }
                    // Reached when no keyword matched.
                    // NOTE(review): this relies on SWITCH_TO leaving the block in the
                    // DidMatch branches above -- confirm against the macro definition.
   1354             parseError();
   1355             m_token->setForceQuirks();
   1356             ADVANCE_TO(BogusDOCTYPEState);
   1357         }
   1358     }
   1359     END_STATE()
   1360 
   1361     BEGIN_STATE(AfterDOCTYPEPublicKeywordState) {
                // First character after the "public" keyword. Whitespace is the
                // expected separator. A quote directly after the keyword is a parse
                // error but still starts the public identifier (initialised to the
                // empty string). '>', the end-of-file marker and any other character
                // are parse errors that set force-quirks.
   1362         if (isTokenizerWhitespace(cc))
   1363             ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
   1364         else if (cc == '"') {
   1365             parseError();
   1366             m_token->setPublicIdentifierToEmptyString();
   1367             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
   1368         } else if (cc == '\'') {
   1369             parseError();
   1370             m_token->setPublicIdentifierToEmptyString();
   1371             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
   1372         } else if (cc == '>') {
   1373             parseError();
   1374             m_token->setForceQuirks();
   1375             return emitAndResumeIn(source, DataState);
   1376         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1377             parseError();
   1378             m_token->setForceQuirks();
   1379             return emitAndReconsumeIn(source, DataState);
   1380         } else {
   1381             parseError();
   1382             m_token->setForceQuirks();
   1383             ADVANCE_TO(BogusDOCTYPEState);
   1384         }
   1385     }
   1386     END_STATE()
   1387 
   1388     BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) {
                // Skips whitespace before the public identifier. A quote starts the
                // identifier (initialised to the empty string) without a parse error;
                // '>', the end-of-file marker and any other character are parse
                // errors that set force-quirks.
   1389         if (isTokenizerWhitespace(cc))
   1390             ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState);
   1391         else if (cc == '"') {
   1392             m_token->setPublicIdentifierToEmptyString();
   1393             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
   1394         } else if (cc == '\'') {
   1395             m_token->setPublicIdentifierToEmptyString();
   1396             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
   1397         } else if (cc == '>') {
   1398             parseError();
   1399             m_token->setForceQuirks();
   1400             return emitAndResumeIn(source, DataState);
   1401         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1402             parseError();
   1403             m_token->setForceQuirks();
   1404             return emitAndReconsumeIn(source, DataState);
   1405         } else {
   1406             parseError();
   1407             m_token->setForceQuirks();
   1408             ADVANCE_TO(BogusDOCTYPEState);
   1409         }
   1410     }
   1411     END_STATE()
   1412 
   1413     BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) {
                // Appends characters to the public identifier until the closing '"'.
                // A '>' or the end-of-file marker before the closing quote is a parse
                // error that sets force-quirks and emits the token.
   1414         if (cc == '"')
   1415             ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
   1416         else if (cc == '>') {
   1417             parseError();
   1418             m_token->setForceQuirks();
   1419             return emitAndResumeIn(source, DataState);
   1420         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1421             parseError();
   1422             m_token->setForceQuirks();
   1423             return emitAndReconsumeIn(source, DataState);
   1424         } else {
   1425             m_token->appendToPublicIdentifier(cc);
   1426             ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState);
   1427         }
   1428     }
   1429     END_STATE()
   1430 
   1431     BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) {
                // Appends characters to the public identifier until the closing '\''.
                // A '>' or the end-of-file marker before the closing quote is a parse
                // error that sets force-quirks and emits the token.
   1432         if (cc == '\'')
   1433             ADVANCE_TO(AfterDOCTYPEPublicIdentifierState);
   1434         else if (cc == '>') {
   1435             parseError();
   1436             m_token->setForceQuirks();
   1437             return emitAndResumeIn(source, DataState);
   1438         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1439             parseError();
   1440             m_token->setForceQuirks();
   1441             return emitAndReconsumeIn(source, DataState);
   1442         } else {
   1443             m_token->appendToPublicIdentifier(cc);
   1444             ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState);
   1445         }
   1446     }
   1447     END_STATE()
   1448 
   1449     BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) {
   1450         if (isTokenizerWhitespace(cc))
   1451             ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
   1452         else if (cc == '>')
   1453             return emitAndResumeIn(source, DataState);
   1454         else if (cc == '"') {
   1455             parseError();
   1456             m_token->setSystemIdentifierToEmptyString();
   1457             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
   1458         } else if (cc == '\'') {
   1459             parseError();
   1460             m_token->setSystemIdentifierToEmptyString();
   1461             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
   1462         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1463             parseError();
   1464             m_token->setForceQuirks();
   1465             return emitAndReconsumeIn(source, DataState);
   1466         } else {
   1467             parseError();
   1468             m_token->setForceQuirks();
   1469             ADVANCE_TO(BogusDOCTYPEState);
   1470         }
   1471     }
   1472     END_STATE()
   1473 
   1474     BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) {
   1475         if (isTokenizerWhitespace(cc))
   1476             ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState);
   1477         else if (cc == '>')
   1478             return emitAndResumeIn(source, DataState);
   1479         else if (cc == '"') {
   1480             m_token->setSystemIdentifierToEmptyString();
   1481             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
   1482         } else if (cc == '\'') {
   1483             m_token->setSystemIdentifierToEmptyString();
   1484             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
   1485         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1486             parseError();
   1487             m_token->setForceQuirks();
   1488             return emitAndReconsumeIn(source, DataState);
   1489         } else {
   1490             parseError();
   1491             m_token->setForceQuirks();
   1492             ADVANCE_TO(BogusDOCTYPEState);
   1493         }
   1494     }
   1495     END_STATE()
   1496 
   1497     BEGIN_STATE(AfterDOCTYPESystemKeywordState) {
   1498         if (isTokenizerWhitespace(cc))
   1499             ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
   1500         else if (cc == '"') {
   1501             parseError();
   1502             m_token->setSystemIdentifierToEmptyString();
   1503             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
   1504         } else if (cc == '\'') {
   1505             parseError();
   1506             m_token->setSystemIdentifierToEmptyString();
   1507             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
   1508         } else if (cc == '>') {
   1509             parseError();
   1510             m_token->setForceQuirks();
   1511             return emitAndResumeIn(source, DataState);
   1512         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1513             parseError();
   1514             m_token->setForceQuirks();
   1515             return emitAndReconsumeIn(source, DataState);
   1516         } else {
   1517             parseError();
   1518             m_token->setForceQuirks();
   1519             ADVANCE_TO(BogusDOCTYPEState);
   1520         }
   1521     }
   1522     END_STATE()
   1523 
   1524     BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) {
   1525         if (isTokenizerWhitespace(cc))
   1526             ADVANCE_TO(BeforeDOCTYPESystemIdentifierState);
   1527         if (cc == '"') {
   1528             m_token->setSystemIdentifierToEmptyString();
   1529             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
   1530         } else if (cc == '\'') {
   1531             m_token->setSystemIdentifierToEmptyString();
   1532             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
   1533         } else if (cc == '>') {
   1534             parseError();
   1535             m_token->setForceQuirks();
   1536             return emitAndResumeIn(source, DataState);
   1537         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1538             parseError();
   1539             m_token->setForceQuirks();
   1540             return emitAndReconsumeIn(source, DataState);
   1541         } else {
   1542             parseError();
   1543             m_token->setForceQuirks();
   1544             ADVANCE_TO(BogusDOCTYPEState);
   1545         }
   1546     }
   1547     END_STATE()
   1548 
   1549     BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) {
   1550         if (cc == '"')
   1551             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
   1552         else if (cc == '>') {
   1553             parseError();
   1554             m_token->setForceQuirks();
   1555             return emitAndResumeIn(source, DataState);
   1556         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1557             parseError();
   1558             m_token->setForceQuirks();
   1559             return emitAndReconsumeIn(source, DataState);
   1560         } else {
   1561             m_token->appendToSystemIdentifier(cc);
   1562             ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState);
   1563         }
   1564     }
   1565     END_STATE()
   1566 
   1567     BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) {
   1568         if (cc == '\'')
   1569             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
   1570         else if (cc == '>') {
   1571             parseError();
   1572             m_token->setForceQuirks();
   1573             return emitAndResumeIn(source, DataState);
   1574         } else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1575             parseError();
   1576             m_token->setForceQuirks();
   1577             return emitAndReconsumeIn(source, DataState);
   1578         } else {
   1579             m_token->appendToSystemIdentifier(cc);
   1580             ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState);
   1581         }
   1582     }
   1583     END_STATE()
   1584 
   1585     BEGIN_STATE(AfterDOCTYPESystemIdentifierState) {
   1586         if (isTokenizerWhitespace(cc))
   1587             ADVANCE_TO(AfterDOCTYPESystemIdentifierState);
   1588         else if (cc == '>')
   1589             return emitAndResumeIn(source, DataState);
   1590         else if (cc == InputStreamPreprocessor::endOfFileMarker) {
   1591             parseError();
   1592             m_token->setForceQuirks();
   1593             return emitAndReconsumeIn(source, DataState);
   1594         } else {
   1595             parseError();
   1596             ADVANCE_TO(BogusDOCTYPEState);
   1597         }
   1598     }
   1599     END_STATE()
   1600 
   1601     BEGIN_STATE(BogusDOCTYPEState) {
   1602         if (cc == '>')
   1603             return emitAndResumeIn(source, DataState);
   1604         else if (cc == InputStreamPreprocessor::endOfFileMarker)
   1605             return emitAndReconsumeIn(source, DataState);
   1606         ADVANCE_TO(BogusDOCTYPEState);
   1607     }
   1608     END_STATE()
   1609 
   1610     BEGIN_STATE(CDATASectionState) {
   1611         if (cc == ']')
   1612             ADVANCE_TO(CDATASectionRightSquareBracketState);
   1613         else if (cc == InputStreamPreprocessor::endOfFileMarker)
   1614             RECONSUME_IN(DataState);
   1615         else {
   1616             bufferCharacter(cc);
   1617             ADVANCE_TO(CDATASectionState);
   1618         }
   1619     }
   1620     END_STATE()
   1621 
   1622     BEGIN_STATE(CDATASectionRightSquareBracketState) {
   1623         if (cc == ']')
   1624             ADVANCE_TO(CDATASectionDoubleRightSquareBracketState);
   1625         else {
   1626             bufferCharacter(']');
   1627             RECONSUME_IN(CDATASectionState);
   1628         }
   1629     }
   1630 
   1631     BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) {
   1632         if (cc == '>')
   1633             ADVANCE_TO(DataState);
   1634         else {
   1635             bufferCharacter(']');
   1636             bufferCharacter(']');
   1637             RECONSUME_IN(CDATASectionState);
   1638         }
   1639     }
   1640     END_STATE()
   1641 
   1642     }
   1643 
   1644     ASSERT_NOT_REACHED();
   1645     return false;
   1646 }
   1647 
   1648 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame)
   1649 {
   1650     if (tagName == textareaTag || tagName == titleTag)
   1651         setState(RCDATAState);
   1652     else if (tagName == plaintextTag)
   1653         setState(PLAINTEXTState);
   1654     else if (tagName == scriptTag)
   1655         setState(ScriptDataState);
   1656     else if (tagName == styleTag
   1657         || tagName == iframeTag
   1658         || tagName == xmpTag
   1659         || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame))
   1660         || tagName == noframesTag
   1661         || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame)))
   1662         setState(RAWTEXTState);
   1663 }
   1664 
   1665 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
   1666 {
   1667     return vectorEqualsString(m_temporaryBuffer, expectedString);
   1668 }
   1669 
   1670 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc)
   1671 {
   1672     ASSERT(isEndTagBufferingState(m_state));
   1673     m_bufferedEndTagName.append(cc);
   1674 }
   1675 
   1676 inline bool HTMLTokenizer::isAppropriateEndTag()
   1677 {
   1678     return m_bufferedEndTagName == m_appropriateEndTagName;
   1679 }
   1680 
   1681 inline void HTMLTokenizer::bufferCharacter(UChar character)
   1682 {
   1683     ASSERT(character != InputStreamPreprocessor::endOfFileMarker);
   1684     m_token->ensureIsCharacterToken();
   1685     m_token->appendToCharacter(character);
   1686 }
   1687 
   1688 inline void HTMLTokenizer::parseError()
   1689 {
   1690     notImplemented();
   1691 }
   1692 
   1693 inline bool HTMLTokenizer::haveBufferedCharacterToken()
   1694 {
   1695     return m_token->type() == HTMLToken::Character;
   1696 }
   1697 
   1698 }
   1699