Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #ifndef HTMLTokenizer_h
     28 #define HTMLTokenizer_h
     29 
     30 #include "SegmentedString.h"
     31 #include <wtf/Noncopyable.h>
     32 #include <wtf/PassOwnPtr.h>
     33 #include <wtf/Vector.h>
     34 #include <wtf/text/AtomicString.h>
     35 
     36 namespace WebCore {
     37 
     38 class Element;
     39 class Frame;
     40 class HTMLToken;
     41 
     42 class HTMLTokenizer {
     43     WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED;
     44 public:
     45     enum State {
     46         DataState,
     47         CharacterReferenceInDataState,
     48         RCDATAState,
     49         CharacterReferenceInRCDATAState,
     50         RAWTEXTState,
     51         ScriptDataState,
     52         PLAINTEXTState,
     53         TagOpenState,
     54         EndTagOpenState,
     55         TagNameState,
     56         RCDATALessThanSignState,
     57         RCDATAEndTagOpenState,
     58         RCDATAEndTagNameState,
     59         RAWTEXTLessThanSignState,
     60         RAWTEXTEndTagOpenState,
     61         RAWTEXTEndTagNameState,
     62         ScriptDataLessThanSignState,
     63         ScriptDataEndTagOpenState,
     64         ScriptDataEndTagNameState,
     65         ScriptDataEscapeStartState,
     66         ScriptDataEscapeStartDashState,
     67         ScriptDataEscapedState,
     68         ScriptDataEscapedDashState,
     69         ScriptDataEscapedDashDashState,
     70         ScriptDataEscapedLessThanSignState,
     71         ScriptDataEscapedEndTagOpenState,
     72         ScriptDataEscapedEndTagNameState,
     73         ScriptDataDoubleEscapeStartState,
     74         ScriptDataDoubleEscapedState,
     75         ScriptDataDoubleEscapedDashState,
     76         ScriptDataDoubleEscapedDashDashState,
     77         ScriptDataDoubleEscapedLessThanSignState,
     78         ScriptDataDoubleEscapeEndState,
     79         BeforeAttributeNameState,
     80         AttributeNameState,
     81         AfterAttributeNameState,
     82         BeforeAttributeValueState,
     83         AttributeValueDoubleQuotedState,
     84         AttributeValueSingleQuotedState,
     85         AttributeValueUnquotedState,
     86         CharacterReferenceInAttributeValueState,
     87         AfterAttributeValueQuotedState,
     88         SelfClosingStartTagState,
     89         BogusCommentState,
     90         // The ContinueBogusCommentState is not in the HTML5 spec, but we use
     91         // it internally to keep track of whether we've started the bogus
     92         // comment token yet.
     93         ContinueBogusCommentState,
     94         MarkupDeclarationOpenState,
     95         CommentStartState,
     96         CommentStartDashState,
     97         CommentState,
     98         CommentEndDashState,
     99         CommentEndState,
    100         CommentEndBangState,
    101         DOCTYPEState,
    102         BeforeDOCTYPENameState,
    103         DOCTYPENameState,
    104         AfterDOCTYPENameState,
    105         AfterDOCTYPEPublicKeywordState,
    106         BeforeDOCTYPEPublicIdentifierState,
    107         DOCTYPEPublicIdentifierDoubleQuotedState,
    108         DOCTYPEPublicIdentifierSingleQuotedState,
    109         AfterDOCTYPEPublicIdentifierState,
    110         BetweenDOCTYPEPublicAndSystemIdentifiersState,
    111         AfterDOCTYPESystemKeywordState,
    112         BeforeDOCTYPESystemIdentifierState,
    113         DOCTYPESystemIdentifierDoubleQuotedState,
    114         DOCTYPESystemIdentifierSingleQuotedState,
    115         AfterDOCTYPESystemIdentifierState,
    116         BogusDOCTYPEState,
    117         CDATASectionState,
    118         // These CDATA states are not in the HTML5 spec, but we use them internally.
    119         CDATASectionRightSquareBracketState,
    120         CDATASectionDoubleRightSquareBracketState,
    121     };
    122 
    123     static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); }
    124     ~HTMLTokenizer();
    125 
    126     void reset();
    127 
    128     // This function returns true if it emits a token. Otherwise, callers
    129     // must provide the same (in progress) token on the next call (unless
    130     // they call reset() first).
    131     bool nextToken(SegmentedString&, HTMLToken&);
    132 
    133     int lineNumber() const { return m_lineNumber; }
    134     int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior.
    135 
    136     State state() const { return m_state; }
    137     void setState(State state) { m_state = state; }
    138 
    139     // Updates the tokenizer's state according to the given tag name. This is
    140     // an approximation of how the tree builder would update the tokenizer's
    141     // state. This method is useful for approximating HTML tokenization. To
    142     // get exactly the correct tokenization, you need the real tree builder.
    143     //
    144     // The main failures in the approximation are as follows:
    145     //
    146     //  * The first set of character tokens emitted for a <pre> element might
    147     //    contain an extra leading newline.
    148     //  * The replacement of U+0000 with U+FFFD will not be sensitive to the
    149     //    tree builder's insertion mode.
    150     //  * CDATA sections in foreign content will be tokenized as bogus comments
    151     //    instead of as character tokens.
    152     //
    153     void updateStateFor(const AtomicString& tagName, Frame*);
    154 
    155     // Hack to skip leading newline in <pre>/<listing> for authoring ease.
    156     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody
    157     void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; }
    158 
    159     bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; }
    160     void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; }
    161 
    162     bool shouldAllowCDATA() const { return m_shouldAllowCDATA; }
    163     void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; }
    164 
    165     bool shouldSkipNullCharacters() const
    166     {
    167         return !m_forceNullCharacterReplacement
    168             && (m_state == DataState
    169                 || m_state == RCDATAState
    170                 || m_state == RAWTEXTState
    171                 || m_state == PLAINTEXTState);
    172     }
    173 
    174 private:
    175     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
    176     class InputStreamPreprocessor {
    177         WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor);
    178     public:
    179         InputStreamPreprocessor(HTMLTokenizer* tokenizer)
    180             : m_tokenizer(tokenizer)
    181             , m_nextInputCharacter('\0')
    182             , m_skipNextNewLine(false)
    183         {
    184         }
    185 
    186         UChar nextInputCharacter() const { return m_nextInputCharacter; }
    187 
    188         // Returns whether we succeeded in peeking at the next character.
    189         // The only way we can fail to peek is if there are no more
    190         // characters in |source| (after collapsing \r\n, etc).
    191         ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber)
    192         {
    193         PeekAgain:
    194             m_nextInputCharacter = *source;
    195 
    196             // Every branch in this function is expensive, so we have a
    197             // fast-reject branch for characters that don't require special
    198             // handling. Please run the parser benchmark whenever you touch
    199             // this function. It's very hot.
    200             static const UChar specialCharacterMask = '\n' | '\r' | '\0';
    201             if (m_nextInputCharacter & ~specialCharacterMask) {
    202                 m_skipNextNewLine = false;
    203                 return true;
    204             }
    205 
    206             if (m_nextInputCharacter == '\n' && m_skipNextNewLine) {
    207                 m_skipNextNewLine = false;
    208                 source.advancePastNewline(lineNumber);
    209                 if (source.isEmpty())
    210                     return false;
    211                 m_nextInputCharacter = *source;
    212             }
    213             if (m_nextInputCharacter == '\r') {
    214                 m_nextInputCharacter = '\n';
    215                 m_skipNextNewLine = true;
    216             } else {
    217                 m_skipNextNewLine = false;
    218                 // FIXME: The spec indicates that the surrogate pair range as well as
    219                 // a number of specific character values are parse errors and should be replaced
    220                 // by the replacement character. We suspect this is a problem with the spec as doing
    221                 // that filtering breaks surrogate pair handling and causes us not to match Minefield.
    222                 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) {
    223                     if (m_tokenizer->shouldSkipNullCharacters()) {
    224                         source.advancePastNonNewline();
    225                         if (source.isEmpty())
    226                             return false;
    227                         goto PeekAgain;
    228                     }
    229                     m_nextInputCharacter = 0xFFFD;
    230                 }
    231             }
    232             return true;
    233         }
    234 
    235         // Returns whether there are more characters in |source| after advancing.
    236         bool advance(SegmentedString& source, int& lineNumber)
    237         {
    238             source.advance(lineNumber);
    239             if (source.isEmpty())
    240                 return false;
    241             return peek(source, lineNumber);
    242         }
    243 
    244         static const UChar endOfFileMarker;
    245 
    246     private:
    247         bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const
    248         {
    249             return source.isClosed() && source.length() == 1;
    250         }
    251 
    252         HTMLTokenizer* m_tokenizer;
    253 
    254         // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character
    255         UChar m_nextInputCharacter;
    256         bool m_skipNextNewLine;
    257     };
    258 
    259     HTMLTokenizer(bool usePreHTML5ParserQuirks);
    260 
    261     inline bool processEntity(SegmentedString&);
    262 
    263     inline void parseError();
    264     inline void bufferCharacter(UChar);
    265     inline void bufferCodePoint(unsigned);
    266 
    267     inline bool emitAndResumeIn(SegmentedString&, State);
    268     inline bool emitAndReconsumeIn(SegmentedString&, State);
    269     inline bool emitEndOfFile(SegmentedString&);
    270     inline bool flushEmitAndResumeIn(SegmentedString&, State);
    271 
    272     // Return whether we need to emit a character token before dealing with
    273     // the buffered end tag.
    274     inline bool flushBufferedEndTag(SegmentedString&);
    275     inline bool temporaryBufferIs(const String&);
    276 
    277     // Sometimes we speculatively consume input characters and we don't
    278     // know whether they represent end tags or RCDATA, etc. These
    279     // functions help manage these state.
    280     inline void addToPossibleEndTag(UChar cc);
    281     inline void saveEndTagNameIfNeeded();
    282     inline bool isAppropriateEndTag();
    283 
    284     inline bool haveBufferedCharacterToken();
    285 
    286     State m_state;
    287 
    288     Vector<UChar, 32> m_appropriateEndTagName;
    289 
    290     // m_token is owned by the caller. If nextToken is not on the stack,
    291     // this member might be pointing to unallocated memory.
    292     HTMLToken* m_token;
    293     int m_lineNumber;
    294 
    295     bool m_skipLeadingNewLineForListing;
    296     bool m_forceNullCharacterReplacement;
    297     bool m_shouldAllowCDATA;
    298 
    299     // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer
    300     Vector<UChar, 32> m_temporaryBuffer;
    301 
    302     // We occationally want to emit both a character token and an end tag
    303     // token (e.g., when lexing script). We buffer the name of the end tag
    304     // token here so we remember it next time we re-enter the tokenizer.
    305     Vector<UChar, 32> m_bufferedEndTagName;
    306 
    307     // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character
    308     UChar m_additionalAllowedCharacter;
    309 
    310     // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream
    311     InputStreamPreprocessor m_inputStreamPreprocessor;
    312 
    313     bool m_usePreHTML5ParserQuirks;
    314 };
    315 
    316 }
    317 
    318 #endif
    319