Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #ifndef HTMLTokenizer_h
     28 #define HTMLTokenizer_h
     29 
     30 #include "core/html/parser/HTMLParserOptions.h"
     31 #include "core/html/parser/HTMLToken.h"
     32 #include "core/html/parser/InputStreamPreprocessor.h"
     33 #include "platform/text/SegmentedString.h"
     34 
     35 namespace WebCore {
     36 
     37 class HTMLTokenizer {
     38     WTF_MAKE_NONCOPYABLE(HTMLTokenizer);
     39     WTF_MAKE_FAST_ALLOCATED;
     40 public:
     41     static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options) { return adoptPtr(new HTMLTokenizer(options)); }
     42     ~HTMLTokenizer();
     43 
     44     void reset();
     45 
     46     enum State {
     47         DataState,
     48         CharacterReferenceInDataState,
     49         RCDATAState,
     50         CharacterReferenceInRCDATAState,
     51         RAWTEXTState,
     52         ScriptDataState,
     53         PLAINTEXTState,
     54         TagOpenState,
     55         EndTagOpenState,
     56         TagNameState,
     57         RCDATALessThanSignState,
     58         RCDATAEndTagOpenState,
     59         RCDATAEndTagNameState,
     60         RAWTEXTLessThanSignState,
     61         RAWTEXTEndTagOpenState,
     62         RAWTEXTEndTagNameState,
     63         ScriptDataLessThanSignState,
     64         ScriptDataEndTagOpenState,
     65         ScriptDataEndTagNameState,
     66         ScriptDataEscapeStartState,
     67         ScriptDataEscapeStartDashState,
     68         ScriptDataEscapedState,
     69         ScriptDataEscapedDashState,
     70         ScriptDataEscapedDashDashState,
     71         ScriptDataEscapedLessThanSignState,
     72         ScriptDataEscapedEndTagOpenState,
     73         ScriptDataEscapedEndTagNameState,
     74         ScriptDataDoubleEscapeStartState,
     75         ScriptDataDoubleEscapedState,
     76         ScriptDataDoubleEscapedDashState,
     77         ScriptDataDoubleEscapedDashDashState,
     78         ScriptDataDoubleEscapedLessThanSignState,
     79         ScriptDataDoubleEscapeEndState,
     80         BeforeAttributeNameState,
     81         AttributeNameState,
     82         AfterAttributeNameState,
     83         BeforeAttributeValueState,
     84         AttributeValueDoubleQuotedState,
     85         AttributeValueSingleQuotedState,
     86         AttributeValueUnquotedState,
     87         CharacterReferenceInAttributeValueState,
     88         AfterAttributeValueQuotedState,
     89         SelfClosingStartTagState,
     90         BogusCommentState,
     91         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
     92         // it internally to keep track of whether we've started the bogus
     93         // comment token yet.
     94         ContinueBogusCommentState,
     95         MarkupDeclarationOpenState,
     96         CommentStartState,
     97         CommentStartDashState,
     98         CommentState,
     99         CommentEndDashState,
    100         CommentEndState,
    101         CommentEndBangState,
    102         DOCTYPEState,
    103         BeforeDOCTYPENameState,
    104         DOCTYPENameState,
    105         AfterDOCTYPENameState,
    106         AfterDOCTYPEPublicKeywordState,
    107         BeforeDOCTYPEPublicIdentifierState,
    108         DOCTYPEPublicIdentifierDoubleQuotedState,
    109         DOCTYPEPublicIdentifierSingleQuotedState,
    110         AfterDOCTYPEPublicIdentifierState,
    111         BetweenDOCTYPEPublicAndSystemIdentifiersState,
    112         AfterDOCTYPESystemKeywordState,
    113         BeforeDOCTYPESystemIdentifierState,
    114         DOCTYPESystemIdentifierDoubleQuotedState,
    115         DOCTYPESystemIdentifierSingleQuotedState,
    116         AfterDOCTYPESystemIdentifierState,
    117         BogusDOCTYPEState,
    118         CDATASectionState,
    119         // These CDATA states are not in the HTML5 spec, but we use them internally.
    120         CDATASectionRightSquareBracketState,
    121         CDATASectionDoubleRightSquareBracketState,
    122     };
    123 
    124     struct Checkpoint {
    125         HTMLParserOptions options;
    126         State state;
    127         UChar additionalAllowedCharacter;
    128         bool skipNextNewLine;
    129         bool forceNullCharacterReplacement;
    130         bool shouldAllowCDATA;
    131 
    132         Checkpoint()
    133             : options(0)
    134             , state()
    135             , additionalAllowedCharacter('\0')
    136             , skipNextNewLine(false)
    137             , forceNullCharacterReplacement(false)
    138             , shouldAllowCDATA(false)
    139         {
    140         }
    141     };
    142 
    143     bool canCreateCheckpoint() const;
    144     void createCheckpoint(Checkpoint&) const;
    145     void restoreFromCheckpoint(const Checkpoint&);
    146 
    147     // This function returns true if it emits a token. Otherwise, callers
    148     // must provide the same (in progress) token on the next call (unless
    149     // they call reset() first).
    150     bool nextToken(SegmentedString&, HTMLToken&);
    151 
    152     // Returns a copy of any characters buffered internally by the tokenizer.
    153     // The tokenizer buffers characters when searching for the </script> token
    154     // that terminates a script element.
    155     String bufferedCharacters() const;
    156 
    157     size_t numberOfBufferedCharacters() const
    158     {
    159         // Notice that we add 2 to the length of the m_temporaryBuffer to
    160         // account for the "</" characters, which are effecitvely buffered in
    161         // the tokenizer's state machine.
    162         return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0;
    163     }
    164 
    165     // Updates the tokenizer's state according to the given tag name. This is
    166     // an approximation of how the tree builder would update the tokenizer's
    167     // state. This method is useful for approximating HTML tokenization. To
    168     // get exactly the correct tokenization, you need the real tree builder.
    169     //
    170     // The main failures in the approximation are as follows:
    171     //
    172     //  * The first set of character tokens emitted for a <pre> element might
    173     //    contain an extra leading newline.
    174     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
    175     //    tree builder's insertion mode.
    176     //  * CDATA sections in foreign content will be tokenized as bogus comments
    177     //    instead of as character tokens.
    178     //
    179     void updateStateFor(const AtomicString& tagName);
    180 
    181     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
    182     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
    183 
    184     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
    185     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
    186 
    187     State state() const { return m_state; }
    188     void setState(State state) { m_state = state; }
    189 
    190     inline bool shouldSkipNullCharacters() const
    191     {
    192         return !m_forceNullCharacterReplacement
    193             && (m_state == HTMLTokenizer::DataState
    194                 || m_state == HTMLTokenizer::RCDATAState
    195                 || m_state == HTMLTokenizer::RAWTEXTState);
    196     }
    197 
    198 private:
    199     explicit HTMLTokenizer(const HTMLParserOptions&);
    200 
    201     inline bool processEntity(SegmentedString&);
    202 
    203     inline void parseError();
    204 
    205     inline void bufferCharacter(UChar character)
    206     {
    207         ASSERT(character != kEndOfFileMarker);
    208         m_token->ensureIsCharacterToken();
    209         m_token->appendToCharacter(character);
    210     }
    211 
    212     inline bool emitAndResumeIn(SegmentedString& source, State state)
    213     {
    214         saveEndTagNameIfNeeded();
    215         m_state = state;
    216         source.advanceAndUpdateLineNumber();
    217         return true;
    218     }
    219 
    220     inline bool emitAndReconsumeIn(SegmentedString&, State state)
    221     {
    222         saveEndTagNameIfNeeded();
    223         m_state = state;
    224         return true;
    225     }
    226 
    227     inline bool emitEndOfFile(SegmentedString& source)
    228     {
    229         if (haveBufferedCharacterToken())
    230             return true;
    231         m_state = HTMLTokenizer::DataState;
    232         source.advanceAndUpdateLineNumber();
    233         m_token->clear();
    234         m_token->makeEndOfFile();
    235         return true;
    236     }
    237 
    238     inline bool flushEmitAndResumeIn(SegmentedString&, State);
    239 
    240     // Return whether we need to emit a character token before dealing with
    241     // the buffered end tag.
    242     inline bool flushBufferedEndTag(SegmentedString&);
    243     inline bool temporaryBufferIs(const String&);
    244 
    245     // Sometimes we speculatively consume input characters and we don't
    246     // know whether they represent end tags or RCDATA, etc. These
    247     // functions help manage these state.
    248     inline void addToPossibleEndTag(LChar cc);
    249 
    250     inline void saveEndTagNameIfNeeded()
    251     {
    252         ASSERT(m_token->type() != HTMLToken::Uninitialized);
    253         if (m_token->type() == HTMLToken::StartTag)
    254             m_appropriateEndTagName = m_token->name();
    255     }
    256     inline bool isAppropriateEndTag();
    257 
    258 
    259     inline bool haveBufferedCharacterToken()
    260     {
    261         return m_token->type() == HTMLToken::Character;
    262     }
    263 
    264     State m_state;
    265     bool m_forceNullCharacterReplacement;
    266     bool m_shouldAllowCDATA;
    267 
    268     // m_token is owned by the caller. If nextToken is not on the stack,
    269     // this member might be pointing to unallocated memory.
    270     HTMLToken* m_token;
    271 
    272     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
    273     UChar m_additionalAllowedCharacter;
    274 
    275     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
    276     InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor;
    277 
    278     Vector<UChar, 32> m_appropriateEndTagName;
    279 
    280     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
    281     Vector<LChar, 32> m_temporaryBuffer;
    282 
    283     // We occationally want to emit both a character token and an end tag
    284     // token (e.g., when lexing script). We buffer the name of the end tag
    285     // token here so we remember it next time we re-enter the tokenizer.
    286     Vector<LChar, 32> m_bufferedEndTagName;
    287 
    288     HTMLParserOptions m_options;
    289 };
    290 
    291 }
    292 
    293 #endif
    294