Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #ifndef HTMLTokenizer_h
     28 #define HTMLTokenizer_h
     29 
     30 #include "core/html/parser/HTMLParserOptions.h"
     31 #include "core/html/parser/HTMLToken.h"
     32 #include "core/html/parser/InputStreamPreprocessor.h"
     33 #include "platform/text/SegmentedString.h"
     34 
     35 namespace blink {
     36 
     37 class HTMLTokenizer {
            // State machine that turns HTML input into tokens. nextToken() reads
            // characters from a SegmentedString and fills in an HTMLToken owned by
            // the caller. Instances are non-copyable and are obtained through
            // create(); the constructor is private.
     38     WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
     39     WTF_MAKE_FAST_ALLOCATED;
     40 public:
            // Factory: heap-allocates a tokenizer constructed from |options| and
            // returns it wrapped by adoptPtr(), so the caller receives ownership.
     41     static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options) { return adoptPtr(new HTMLTokenizer(options)); }
     42     ~HTMLTokenizer();
     43 
            // Defined out of line. NOTE(review): presumably returns the tokenizer
            // to its initial state; confirm against the definition.
     44     void reset();
     45 
            // Tokenizer states. Apart from the entries flagged in the comments
            // below, the names are meant to mirror the states of the HTML5
            // tokenizer specification. No enumerator has an explicit value, so
            // DataState is 0 and inserting or reordering entries changes the
            // value of every later one.
     46     enum State {
     47         DataState,
     48         CharacterReferenceInDataState,
     49         RCDATAState,
     50         CharacterReferenceInRCDATAState,
     51         RAWTEXTState,
     52         ScriptDataState,
     53         PLAINTEXTState,
     54         TagOpenState,
     55         EndTagOpenState,
     56         TagNameState,
     57         RCDATALessThanSignState,
     58         RCDATAEndTagOpenState,
     59         RCDATAEndTagNameState,
     60         RAWTEXTLessThanSignState,
     61         RAWTEXTEndTagOpenState,
     62         RAWTEXTEndTagNameState,
     63         ScriptDataLessThanSignState,
     64         ScriptDataEndTagOpenState,
     65         ScriptDataEndTagNameState,
     66         ScriptDataEscapeStartState,
     67         ScriptDataEscapeStartDashState,
     68         ScriptDataEscapedState,
     69         ScriptDataEscapedDashState,
     70         ScriptDataEscapedDashDashState,
     71         ScriptDataEscapedLessThanSignState,
     72         ScriptDataEscapedEndTagOpenState,
     73         ScriptDataEscapedEndTagNameState,
     74         ScriptDataDoubleEscapeStartState,
     75         ScriptDataDoubleEscapedState,
     76         ScriptDataDoubleEscapedDashState,
     77         ScriptDataDoubleEscapedDashDashState,
     78         ScriptDataDoubleEscapedLessThanSignState,
     79         ScriptDataDoubleEscapeEndState,
     80         BeforeAttributeNameState,
     81         AttributeNameState,
     82         AfterAttributeNameState,
     83         BeforeAttributeValueState,
     84         AttributeValueDoubleQuotedState,
     85         AttributeValueSingleQuotedState,
     86         AttributeValueUnquotedState,
     87         CharacterReferenceInAttributeValueState,
     88         AfterAttributeValueQuotedState,
     89         SelfClosingStartTagState,
     90         BogusCommentState,
     91         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
     92         // it internally to keep track of whether we've started the bogus
     93         // comment token yet.
     94         ContinueBogusCommentState,
     95         MarkupDeclarationOpenState,
     96         CommentStartState,
     97         CommentStartDashState,
     98         CommentState,
     99         CommentEndDashState,
    100         CommentEndState,
    101         CommentEndBangState,
    102         DOCTYPEState,
    103         BeforeDOCTYPENameState,
    104         DOCTYPENameState,
    105         AfterDOCTYPENameState,
    106         AfterDOCTYPEPublicKeywordState,
    107         BeforeDOCTYPEPublicIdentifierState,
    108         DOCTYPEPublicIdentifierDoubleQuotedState,
    109         DOCTYPEPublicIdentifierSingleQuotedState,
    110         AfterDOCTYPEPublicIdentifierState,
    111         BetweenDOCTYPEPublicAndSystemIdentifiersState,
    112         AfterDOCTYPESystemKeywordState,
    113         BeforeDOCTYPESystemIdentifierState,
    114         DOCTYPESystemIdentifierDoubleQuotedState,
    115         DOCTYPESystemIdentifierSingleQuotedState,
    116         AfterDOCTYPESystemIdentifierState,
    117         BogusDOCTYPEState,
    118         CDATASectionState,
    119         // These CDATA states are not in the HTML5 spec, but we use them internally.
    120         CDATASectionRightSquareBracketState,
    121         CDATASectionDoubleRightSquareBracketState,
    122     };
    123 
            // Plain value bundle exchanged with createCheckpoint() and
            // restoreFromCheckpoint().
            // NOTE(review): options, state, additionalAllowedCharacter and
            // shouldAllowCDATA have same-named m_ members in this class.
            // skipNextNewLine has no such member here; presumably it belongs to
            // the input stream preprocessor. Confirm in createCheckpoint().
    124     struct Checkpoint {
    125         HTMLParserOptions options;
    126         State state;
    127         UChar additionalAllowedCharacter;
    128         bool skipNextNewLine;
    129         bool shouldAllowCDATA;
    130 
                // Default checkpoint: every field is given an explicit initial
                // value. state() value-initializes the enum, which yields State(0),
                // i.e. DataState. NOTE(review): options(0) presumably passes a null
                // pointer to the HTMLParserOptions constructor; confirm there.
    131         Checkpoint()
    132             : options(0)
    133             , state()
    134             , additionalAllowedCharacter('\0')
    135             , skipNextNewLine(false)
    136             , shouldAllowCDATA(false)
    137         {
    138         }
    139     };
    140 
            // Checkpoint API; all three are defined out of line.
            // NOTE(review): callers are presumably expected to check
            // canCreateCheckpoint() before createCheckpoint(); confirm in the
            // definitions.
    141     bool canCreateCheckpoint() const;
    142     void createCheckpoint(Checkpoint&) const;
    143     void restoreFromCheckpoint(const Checkpoint&);
    144 
    145     // This function returns true if it emits a token. Otherwise, callers
    146     // must provide the same (in progress) token on the next call (unless
    147     // they call reset() first).
    148     bool nextToken(SegmentedString&, HTMLToken&);
    149 
    150     // Returns a copy of any characters buffered internally by the tokenizer.
    151     // The tokenizer buffers characters when searching for the </script> token
    152     // that terminates a script element.
    153     String bufferedCharacters() const;
    154 
            // Count of buffered characters: 0 when m_temporaryBuffer is empty,
            // otherwise the buffer's size plus 2.
    155     size_t numberOfBufferedCharacters() const
    156     {
    157         // Notice that we add 2 to the length of the m_temporaryBuffer to
    158         // account for the "</" characters, which are effectively buffered in
    159         // the tokenizer's state machine.
    160         return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
    161     }
    162 
    163     // Updates the tokenizer's state according to the given tag name. This is
    164     // an approximation of how the tree builder would update the tokenizer's
    165     // state. This method is useful for approximating HTML tokenization. To
    166     // get exactly the correct tokenization, you need the real tree builder.
    167     //
    168     // The main failures in the approximation are as follows:
    169     //
    170     //  * The first set of character tokens emitted for a <pre> element might
    171     //    contain an extra leading newline.
    172     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
    173     //    tree builder's insertion mode.
    174     //  * CDATA sections in foreign content will be tokenized as bogus comments
    175     //    instead of as character tokens.
    176     //
    177     void updateStateFor(const String& tagName);
    178 
            // Accessors for m_forceNullCharacterReplacement. When the flag is set,
            // shouldSkipNullCharacters() below always returns false.
    179     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
    180     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
    181 
            // Accessors for m_shouldAllowCDATA.
    182     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
    183     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
    184 
            // Accessors for the current state. setState() assigns m_state only;
            // it does not touch any other member.
    185     State state() const { return m_state; }
    186     void setState(State state) { m_state = state; }
    187 
            // Returns true only when null-character replacement is not forced and
            // the tokenizer is currently in DataState, RCDATAState or RAWTEXTState.
    188     inline bool shouldSkipNullCharacters() const
    189     {
    190         return !m_forceNullCharacterReplacement
    191             && (m_state == HTMLTokenizer::DataState
    192                 || m_state == HTMLTokenizer::RCDATAState
    193                 || m_state == HTMLTokenizer::RAWTEXTState);
    194     }
    195 
    196 private:
            // Private so that instances can only be obtained through create().
    197     explicit HTMLTokenizer(const HTMLParserOptions&);
    198 
            // Declared inline here but not defined in this header, so it is usable
            // only from translation units that also see the definition.
    199     inline bool processEntity(SegmentedString&);
    200 
            // Declared inline here; the definition is not in this header.
    201     inline void parseError();
    202 
            // Appends |character| to the current token after making sure the token
            // is a character token. Asserts that |character| is not
            // kEndOfFileMarker.
    203     inline void bufferCharacter(UChar character)
    204     {
    205         ASSERT(character != kEndOfFileMarker);
    206         m_token->ensureIsCharacterToken();
    207         m_token->appendToCharacter(character);
    208     }
    209 
            // Finishes the current token and continues in |state|: records the tag
            // name if the token is a start tag, switches m_state, and advances
            // |source| (updating its line number). Always returns true.
            // NOTE(review): the return value is presumably propagated as
            // nextToken()'s "token emitted" result; confirm at the call sites.
    210     inline bool emitAndResumeIn(SegmentedString& source, State state)
    211     {
    212         saveEndTagNameIfNeeded();
    213         m_state = state;
    214         source.advanceAndUpdateLineNumber();
    215         return true;
    216     }
    217 
            // Same as emitAndResumeIn() except that the input is not advanced (the
            // SegmentedString parameter is unnamed and unused), so the current
            // character is left in place for |state|. Always returns true.
    218     inline bool emitAndReconsumeIn(SegmentedString&, State state)
    219     {
    220         saveEndTagNameIfNeeded();
    221         m_state = state;
    222         return true;
    223     }
    224 
            // End-of-input handling. If a character token is pending, returns true
            // immediately without changing m_state, advancing |source| or touching
            // the token. Otherwise switches to DataState, advances |source|, and
            // turns the cleared token into an end-of-file token. Returns true on
            // both paths.
            // NOTE(review): the early return presumably lets the pending
            // characters be emitted first, with a later call producing the
            // end-of-file token; confirm in nextToken().
    225     inline bool emitEndOfFile(SegmentedString& source)
    226     {
    227         if (haveBufferedCharacterToken())
    228             return true;
    229         m_state = HTMLTokenizer::DataState;
    230         source.advanceAndUpdateLineNumber();
    231         m_token->clear();
    232         m_token->makeEndOfFile();
    233         return true;
    234     }
    235 
            // Declared inline here; the definition is not in this header.
    236     inline bool flushEmitAndResumeIn(SegmentedString&, State);
    237 
    238     // Return whether we need to emit a character token before dealing with
    239     // the buffered end tag.
    240     inline bool flushBufferedEndTag(SegmentedString&);
            // Declared inline here; the definition is not in this header.
    241     inline bool temporaryBufferIs(const String&);
    242 
    243     // Sometimes we speculatively consume input characters and we don't
    244     // know whether they represent end tags or RCDATA, etc. These
    245     // functions help manage this state.
    246     inline void addToPossibleEndTag(LChar cc);
    247 
            // If the current token is a start tag, copies its name into
            // m_appropriateEndTagName; other token types leave the saved name
            // unchanged. Asserts that the token is not Uninitialized.
            // NOTE(review): the saved name is presumably what
            // isAppropriateEndTag() compares against; confirm in its definition.
    248     inline void saveEndTagNameIfNeeded()
    249     {
    250         ASSERT(m_token->type() != HTMLToken::Uninitialized);
    251         if (m_token->type() == HTMLToken::StartTag)
    252             m_appropriateEndTagName = m_token->name();
    253     }
            // Declared inline here; the definition is not in this header.
    254     inline bool isAppropriateEndTag();
    255 
    256 
            // True when the current token's type is HTMLToken::Character.
    257     inline bool haveBufferedCharacterToken()
    258     {
    259         return m_token->type() == HTMLToken::Character;
    260     }
    261 
            // Current tokenizer state and the two flags exposed by the public
            // accessors above.
    262     State m_state;
    263     bool m_forceNullCharacterReplacement;
    264     bool m_shouldAllowCDATA;
    265 
    266     // m_token is owned by the caller. If nextToken is not on the stack,
    267     // this member might be pointing to unallocated memory.
    268     HTMLToken* m_token;
    269 
    270     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
    271     UChar m_additionalAllowedCharacter;
    272 
    273     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
    274     InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
    275 
            // Name of the most recent start tag token, as recorded by
            // saveEndTagNameIfNeeded().
    276     Vector<UChar, 32> m_appropriateEndTagName;
    277 
    278     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
    279     Vector<LChar, 32> m_temporaryBuffer;
    280 
    281     // We occasionally want to emit both a character token and an end tag
    282     // token (e.g., when lexing script). We buffer the name of the end tag
    283     // token here so we remember it next time we re-enter the tokenizer.
    284     Vector<LChar, 32> m_bufferedEndTagName;
    285 
            // Parser options; the Checkpoint struct carries a field of the same
            // type.
    286     HTMLParserOptions m_options;
    287 };
    288 
    289 }
    290 
    291 #endif
    292