1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #ifndef HTMLTokenizer_h 28 #define HTMLTokenizer_h 29 30 #include "core/html/parser/HTMLParserOptions.h" 31 #include "core/html/parser/HTMLToken.h" 32 #include "core/html/parser/InputStreamPreprocessor.h" 33 #include "platform/text/SegmentedString.h" 34 35 namespace WebCore { 36 37 class HTMLTokenizer { 38 WTF_MAKE_NONCOPYABLE(HTMLTokenizer); 39 WTF_MAKE_FAST_ALLOCATED; 40 public: 41 static PassOwnPtr<HTMLTokenizer> create(const HTMLParserOptions& options) { return adoptPtr(new HTMLTokenizer(options)); } 42 ~HTMLTokenizer(); 43 44 void reset(); 45 46 enum State { 47 DataState, 48 CharacterReferenceInDataState, 49 RCDATAState, 50 CharacterReferenceInRCDATAState, 51 RAWTEXTState, 52 ScriptDataState, 53 PLAINTEXTState, 54 TagOpenState, 55 EndTagOpenState, 56 TagNameState, 57 RCDATALessThanSignState, 58 RCDATAEndTagOpenState, 59 RCDATAEndTagNameState, 60 RAWTEXTLessThanSignState, 61 RAWTEXTEndTagOpenState, 62 RAWTEXTEndTagNameState, 63 ScriptDataLessThanSignState, 64 ScriptDataEndTagOpenState, 65 ScriptDataEndTagNameState, 66 ScriptDataEscapeStartState, 67 ScriptDataEscapeStartDashState, 68 ScriptDataEscapedState, 69 ScriptDataEscapedDashState, 70 ScriptDataEscapedDashDashState, 71 ScriptDataEscapedLessThanSignState, 72 ScriptDataEscapedEndTagOpenState, 73 ScriptDataEscapedEndTagNameState, 74 ScriptDataDoubleEscapeStartState, 75 ScriptDataDoubleEscapedState, 76 ScriptDataDoubleEscapedDashState, 77 ScriptDataDoubleEscapedDashDashState, 78 ScriptDataDoubleEscapedLessThanSignState, 79 ScriptDataDoubleEscapeEndState, 80 BeforeAttributeNameState, 81 AttributeNameState, 82 AfterAttributeNameState, 83 BeforeAttributeValueState, 84 AttributeValueDoubleQuotedState, 85 AttributeValueSingleQuotedState, 86 AttributeValueUnquotedState, 87 CharacterReferenceInAttributeValueState, 88 AfterAttributeValueQuotedState, 89 SelfClosingStartTagState, 90 BogusCommentState, 91 // The ContinueBogusCommentState is not in the HTML5 spec, but we use 92 // it internally to keep track of whether we've started the bogus 93 // comment token yet. 94 ContinueBogusCommentState, 95 MarkupDeclarationOpenState, 96 CommentStartState, 97 CommentStartDashState, 98 CommentState, 99 CommentEndDashState, 100 CommentEndState, 101 CommentEndBangState, 102 DOCTYPEState, 103 BeforeDOCTYPENameState, 104 DOCTYPENameState, 105 AfterDOCTYPENameState, 106 AfterDOCTYPEPublicKeywordState, 107 BeforeDOCTYPEPublicIdentifierState, 108 DOCTYPEPublicIdentifierDoubleQuotedState, 109 DOCTYPEPublicIdentifierSingleQuotedState, 110 AfterDOCTYPEPublicIdentifierState, 111 BetweenDOCTYPEPublicAndSystemIdentifiersState, 112 AfterDOCTYPESystemKeywordState, 113 BeforeDOCTYPESystemIdentifierState, 114 DOCTYPESystemIdentifierDoubleQuotedState, 115 DOCTYPESystemIdentifierSingleQuotedState, 116 AfterDOCTYPESystemIdentifierState, 117 BogusDOCTYPEState, 118 CDATASectionState, 119 // These CDATA states are not in the HTML5 spec, but we use them internally. 120 CDATASectionRightSquareBracketState, 121 CDATASectionDoubleRightSquareBracketState, 122 }; 123 124 struct Checkpoint { 125 HTMLParserOptions options; 126 State state; 127 UChar additionalAllowedCharacter; 128 bool skipNextNewLine; 129 bool forceNullCharacterReplacement; 130 bool shouldAllowCDATA; 131 132 Checkpoint() 133 : options(0) 134 , state() 135 , additionalAllowedCharacter('\0') 136 , skipNextNewLine(false) 137 , forceNullCharacterReplacement(false) 138 , shouldAllowCDATA(false) 139 { 140 } 141 }; 142 143 bool canCreateCheckpoint() const; 144 void createCheckpoint(Checkpoint&) const; 145 void restoreFromCheckpoint(const Checkpoint&); 146 147 // This function returns true if it emits a token. Otherwise, callers 148 // must provide the same (in progress) token on the next call (unless 149 // they call reset() first). 150 bool nextToken(SegmentedString&, HTMLToken&); 151 152 // Returns a copy of any characters buffered internally by the tokenizer. 153 // The tokenizer buffers characters when searching for the </script> token 154 // that terminates a script element. 155 String bufferedCharacters() const; 156 157 size_t numberOfBufferedCharacters() const 158 { 159 // Notice that we add 2 to the length of the m_temporaryBuffer to 160 // account for the "</" characters, which are effecitvely buffered in 161 // the tokenizer's state machine. 162 return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0; 163 } 164 165 // Updates the tokenizer's state according to the given tag name. This is 166 // an approximation of how the tree builder would update the tokenizer's 167 // state. This method is useful for approximating HTML tokenization. To 168 // get exactly the correct tokenization, you need the real tree builder. 169 // 170 // The main failures in the approximation are as follows: 171 // 172 // * The first set of character tokens emitted for a <pre> element might 173 // contain an extra leading newline. 174 // * The replacement of U+0000 with U+FFFD will not be sensitive to the 175 // tree builder's insertion mode. 176 // * CDATA sections in foreign content will be tokenized as bogus comments 177 // instead of as character tokens. 178 // 179 void updateStateFor(const AtomicString& tagName); 180 181 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } 182 void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } 183 184 bool shouldAllowCDATA() const { return m_shouldAllowCDATA; } 185 void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; } 186 187 State state() const { return m_state; } 188 void setState(State state) { m_state = state; } 189 190 inline bool shouldSkipNullCharacters() const 191 { 192 return !m_forceNullCharacterReplacement 193 && (m_state == HTMLTokenizer::DataState 194 || m_state == HTMLTokenizer::RCDATAState 195 || m_state == HTMLTokenizer::RAWTEXTState); 196 } 197 198 private: 199 explicit HTMLTokenizer(const HTMLParserOptions&); 200 201 inline bool processEntity(SegmentedString&); 202 203 inline void parseError(); 204 205 inline void bufferCharacter(UChar character) 206 { 207 ASSERT(character != kEndOfFileMarker); 208 m_token->ensureIsCharacterToken(); 209 m_token->appendToCharacter(character); 210 } 211 212 inline bool emitAndResumeIn(SegmentedString& source, State state) 213 { 214 saveEndTagNameIfNeeded(); 215 m_state = state; 216 source.advanceAndUpdateLineNumber(); 217 return true; 218 } 219 220 inline bool emitAndReconsumeIn(SegmentedString&, State state) 221 { 222 saveEndTagNameIfNeeded(); 223 m_state = state; 224 return true; 225 } 226 227 inline bool emitEndOfFile(SegmentedString& source) 228 { 229 if (haveBufferedCharacterToken()) 230 return true; 231 m_state = HTMLTokenizer::DataState; 232 source.advanceAndUpdateLineNumber(); 233 m_token->clear(); 234 m_token->makeEndOfFile(); 235 return true; 236 } 237 238 inline bool flushEmitAndResumeIn(SegmentedString&, State); 239 240 // Return whether we need to emit a character token before dealing with 241 // the buffered end tag. 242 inline bool flushBufferedEndTag(SegmentedString&); 243 inline bool temporaryBufferIs(const String&); 244 245 // Sometimes we speculatively consume input characters and we don't 246 // know whether they represent end tags or RCDATA, etc. These 247 // functions help manage these state. 248 inline void addToPossibleEndTag(LChar cc); 249 250 inline void saveEndTagNameIfNeeded() 251 { 252 ASSERT(m_token->type() != HTMLToken::Uninitialized); 253 if (m_token->type() == HTMLToken::StartTag) 254 m_appropriateEndTagName = m_token->name(); 255 } 256 inline bool isAppropriateEndTag(); 257 258 259 inline bool haveBufferedCharacterToken() 260 { 261 return m_token->type() == HTMLToken::Character; 262 } 263 264 State m_state; 265 bool m_forceNullCharacterReplacement; 266 bool m_shouldAllowCDATA; 267 268 // m_token is owned by the caller. If nextToken is not on the stack, 269 // this member might be pointing to unallocated memory. 270 HTMLToken* m_token; 271 272 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character 273 UChar m_additionalAllowedCharacter; 274 275 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 276 InputStreamPreprocessor<HTMLTokenizer> m_inputStreamPreprocessor; 277 278 Vector<UChar, 32> m_appropriateEndTagName; 279 280 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer 281 Vector<LChar, 32> m_temporaryBuffer; 282 283 // We occationally want to emit both a character token and an end tag 284 // token (e.g., when lexing script). We buffer the name of the end tag 285 // token here so we remember it next time we re-enter the tokenizer. 286 Vector<LChar, 32> m_bufferedEndTagName; 287 288 HTMLParserOptions m_options; 289 }; 290 291 } 292 293 #endif 294