1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #ifndef HTMLTokenizer_h 28 #define HTMLTokenizer_h 29 30 #include "SegmentedString.h" 31 #include <wtf/Noncopyable.h> 32 #include <wtf/PassOwnPtr.h> 33 #include <wtf/Vector.h> 34 #include <wtf/text/AtomicString.h> 35 36 namespace WebCore { 37 38 class Element; 39 class Frame; 40 class HTMLToken; 41 42 class HTMLTokenizer { 43 WTF_MAKE_NONCOPYABLE(HTMLTokenizer); WTF_MAKE_FAST_ALLOCATED; 44 public: 45 enum State { 46 DataState, 47 CharacterReferenceInDataState, 48 RCDATAState, 49 CharacterReferenceInRCDATAState, 50 RAWTEXTState, 51 ScriptDataState, 52 PLAINTEXTState, 53 TagOpenState, 54 EndTagOpenState, 55 TagNameState, 56 RCDATALessThanSignState, 57 RCDATAEndTagOpenState, 58 RCDATAEndTagNameState, 59 RAWTEXTLessThanSignState, 60 RAWTEXTEndTagOpenState, 61 RAWTEXTEndTagNameState, 62 ScriptDataLessThanSignState, 63 ScriptDataEndTagOpenState, 64 ScriptDataEndTagNameState, 65 ScriptDataEscapeStartState, 66 ScriptDataEscapeStartDashState, 67 ScriptDataEscapedState, 68 ScriptDataEscapedDashState, 69 ScriptDataEscapedDashDashState, 70 ScriptDataEscapedLessThanSignState, 71 ScriptDataEscapedEndTagOpenState, 72 ScriptDataEscapedEndTagNameState, 73 ScriptDataDoubleEscapeStartState, 74 ScriptDataDoubleEscapedState, 75 ScriptDataDoubleEscapedDashState, 76 ScriptDataDoubleEscapedDashDashState, 77 ScriptDataDoubleEscapedLessThanSignState, 78 ScriptDataDoubleEscapeEndState, 79 BeforeAttributeNameState, 80 AttributeNameState, 81 AfterAttributeNameState, 82 BeforeAttributeValueState, 83 AttributeValueDoubleQuotedState, 84 AttributeValueSingleQuotedState, 85 AttributeValueUnquotedState, 86 CharacterReferenceInAttributeValueState, 87 AfterAttributeValueQuotedState, 88 SelfClosingStartTagState, 89 BogusCommentState, 90 // The ContinueBogusCommentState is not in the HTML5 spec, but we use 91 // it internally to keep track of whether we've started the bogus 92 // comment token yet. 93 ContinueBogusCommentState, 94 MarkupDeclarationOpenState, 95 CommentStartState, 96 CommentStartDashState, 97 CommentState, 98 CommentEndDashState, 99 CommentEndState, 100 CommentEndBangState, 101 DOCTYPEState, 102 BeforeDOCTYPENameState, 103 DOCTYPENameState, 104 AfterDOCTYPENameState, 105 AfterDOCTYPEPublicKeywordState, 106 BeforeDOCTYPEPublicIdentifierState, 107 DOCTYPEPublicIdentifierDoubleQuotedState, 108 DOCTYPEPublicIdentifierSingleQuotedState, 109 AfterDOCTYPEPublicIdentifierState, 110 BetweenDOCTYPEPublicAndSystemIdentifiersState, 111 AfterDOCTYPESystemKeywordState, 112 BeforeDOCTYPESystemIdentifierState, 113 DOCTYPESystemIdentifierDoubleQuotedState, 114 DOCTYPESystemIdentifierSingleQuotedState, 115 AfterDOCTYPESystemIdentifierState, 116 BogusDOCTYPEState, 117 CDATASectionState, 118 // These CDATA states are not in the HTML5 spec, but we use them internally. 119 CDATASectionRightSquareBracketState, 120 CDATASectionDoubleRightSquareBracketState, 121 }; 122 123 static PassOwnPtr<HTMLTokenizer> create(bool usePreHTML5ParserQuirks) { return adoptPtr(new HTMLTokenizer(usePreHTML5ParserQuirks)); } 124 ~HTMLTokenizer(); 125 126 void reset(); 127 128 // This function returns true if it emits a token. Otherwise, callers 129 // must provide the same (in progress) token on the next call (unless 130 // they call reset() first). 131 bool nextToken(SegmentedString&, HTMLToken&); 132 133 int lineNumber() const { return m_lineNumber; } 134 int columnNumber() const { return 1; } // Matches LegacyHTMLDocumentParser.h behavior. 135 136 State state() const { return m_state; } 137 void setState(State state) { m_state = state; } 138 139 // Updates the tokenizer's state according to the given tag name. This is 140 // an approximation of how the tree builder would update the tokenizer's 141 // state. This method is useful for approximating HTML tokenization. To 142 // get exactly the correct tokenization, you need the real tree builder. 143 // 144 // The main failures in the approximation are as follows: 145 // 146 // * The first set of character tokens emitted for a <pre> element might 147 // contain an extra leading newline. 148 // * The replacement of U+0000 with U+FFFD will not be sensitive to the 149 // tree builder's insertion mode. 150 // * CDATA sections in foreign content will be tokenized as bogus comments 151 // instead of as character tokens. 152 // 153 void updateStateFor(const AtomicString& tagName, Frame*); 154 155 // Hack to skip leading newline in <pre>/<listing> for authoring ease. 156 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody 157 void setSkipLeadingNewLineForListing(bool value) { m_skipLeadingNewLineForListing = value; } 158 159 bool forceNullCharacterReplacement() const { return m_forceNullCharacterReplacement; } 160 void setForceNullCharacterReplacement(bool value) { m_forceNullCharacterReplacement = value; } 161 162 bool shouldAllowCDATA() const { return m_shouldAllowCDATA; } 163 void setShouldAllowCDATA(bool value) { m_shouldAllowCDATA = value; } 164 165 bool shouldSkipNullCharacters() const 166 { 167 return !m_forceNullCharacterReplacement 168 && (m_state == DataState 169 || m_state == RCDATAState 170 || m_state == RAWTEXTState 171 || m_state == PLAINTEXTState); 172 } 173 174 private: 175 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 176 class InputStreamPreprocessor { 177 WTF_MAKE_NONCOPYABLE(InputStreamPreprocessor); 178 public: 179 InputStreamPreprocessor(HTMLTokenizer* tokenizer) 180 : m_tokenizer(tokenizer) 181 , m_nextInputCharacter('\0') 182 , m_skipNextNewLine(false) 183 { 184 } 185 186 UChar nextInputCharacter() const { return m_nextInputCharacter; } 187 188 // Returns whether we succeeded in peeking at the next character. 189 // The only way we can fail to peek is if there are no more 190 // characters in |source| (after collapsing \r\n, etc). 191 ALWAYS_INLINE bool peek(SegmentedString& source, int& lineNumber) 192 { 193 PeekAgain: 194 m_nextInputCharacter = *source; 195 196 // Every branch in this function is expensive, so we have a 197 // fast-reject branch for characters that don't require special 198 // handling. Please run the parser benchmark whenever you touch 199 // this function. It's very hot. 200 static const UChar specialCharacterMask = '\n' | '\r' | '\0'; 201 if (m_nextInputCharacter & ~specialCharacterMask) { 202 m_skipNextNewLine = false; 203 return true; 204 } 205 206 if (m_nextInputCharacter == '\n' && m_skipNextNewLine) { 207 m_skipNextNewLine = false; 208 source.advancePastNewline(lineNumber); 209 if (source.isEmpty()) 210 return false; 211 m_nextInputCharacter = *source; 212 } 213 if (m_nextInputCharacter == '\r') { 214 m_nextInputCharacter = '\n'; 215 m_skipNextNewLine = true; 216 } else { 217 m_skipNextNewLine = false; 218 // FIXME: The spec indicates that the surrogate pair range as well as 219 // a number of specific character values are parse errors and should be replaced 220 // by the replacement character. We suspect this is a problem with the spec as doing 221 // that filtering breaks surrogate pair handling and causes us not to match Minefield. 222 if (m_nextInputCharacter == '\0' && !shouldTreatNullAsEndOfFileMarker(source)) { 223 if (m_tokenizer->shouldSkipNullCharacters()) { 224 source.advancePastNonNewline(); 225 if (source.isEmpty()) 226 return false; 227 goto PeekAgain; 228 } 229 m_nextInputCharacter = 0xFFFD; 230 } 231 } 232 return true; 233 } 234 235 // Returns whether there are more characters in |source| after advancing. 236 bool advance(SegmentedString& source, int& lineNumber) 237 { 238 source.advance(lineNumber); 239 if (source.isEmpty()) 240 return false; 241 return peek(source, lineNumber); 242 } 243 244 static const UChar endOfFileMarker; 245 246 private: 247 bool shouldTreatNullAsEndOfFileMarker(SegmentedString& source) const 248 { 249 return source.isClosed() && source.length() == 1; 250 } 251 252 HTMLTokenizer* m_tokenizer; 253 254 // http://www.whatwg.org/specs/web-apps/current-work/#next-input-character 255 UChar m_nextInputCharacter; 256 bool m_skipNextNewLine; 257 }; 258 259 HTMLTokenizer(bool usePreHTML5ParserQuirks); 260 261 inline bool processEntity(SegmentedString&); 262 263 inline void parseError(); 264 inline void bufferCharacter(UChar); 265 inline void bufferCodePoint(unsigned); 266 267 inline bool emitAndResumeIn(SegmentedString&, State); 268 inline bool emitAndReconsumeIn(SegmentedString&, State); 269 inline bool emitEndOfFile(SegmentedString&); 270 inline bool flushEmitAndResumeIn(SegmentedString&, State); 271 272 // Return whether we need to emit a character token before dealing with 273 // the buffered end tag. 274 inline bool flushBufferedEndTag(SegmentedString&); 275 inline bool temporaryBufferIs(const String&); 276 277 // Sometimes we speculatively consume input characters and we don't 278 // know whether they represent end tags or RCDATA, etc. These 279 // functions help manage these state. 280 inline void addToPossibleEndTag(UChar cc); 281 inline void saveEndTagNameIfNeeded(); 282 inline bool isAppropriateEndTag(); 283 284 inline bool haveBufferedCharacterToken(); 285 286 State m_state; 287 288 Vector<UChar, 32> m_appropriateEndTagName; 289 290 // m_token is owned by the caller. If nextToken is not on the stack, 291 // this member might be pointing to unallocated memory. 292 HTMLToken* m_token; 293 int m_lineNumber; 294 295 bool m_skipLeadingNewLineForListing; 296 bool m_forceNullCharacterReplacement; 297 bool m_shouldAllowCDATA; 298 299 // http://www.whatwg.org/specs/web-apps/current-work/#temporary-buffer 300 Vector<UChar, 32> m_temporaryBuffer; 301 302 // We occationally want to emit both a character token and an end tag 303 // token (e.g., when lexing script). We buffer the name of the end tag 304 // token here so we remember it next time we re-enter the tokenizer. 305 Vector<UChar, 32> m_bufferedEndTagName; 306 307 // http://www.whatwg.org/specs/web-apps/current-work/#additional-allowed-character 308 UChar m_additionalAllowedCharacter; 309 310 // http://www.whatwg.org/specs/web-apps/current-work/#preprocessing-the-input-stream 311 InputStreamPreprocessor m_inputStreamPreprocessor; 312 313 bool m_usePreHTML5ParserQuirks; 314 }; 315 316 } 317 318 #endif 319