1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "HTMLTokenizer.h" 30 31 #include "HTMLEntityParser.h" 32 #include "HTMLToken.h" 33 #include "HTMLTreeBuilder.h" 34 #include "HTMLNames.h" 35 #include "NotImplemented.h" 36 #include <wtf/ASCIICType.h> 37 #include <wtf/CurrentTime.h> 38 #include <wtf/UnusedParam.h> 39 #include <wtf/text/AtomicString.h> 40 #include <wtf/text/CString.h> 41 #include <wtf/unicode/Unicode.h> 42 43 using namespace WTF; 44 45 namespace WebCore { 46 47 using namespace HTMLNames; 48 49 const UChar HTMLTokenizer::InputStreamPreprocessor::endOfFileMarker = 0; 50 51 namespace { 52 53 inline UChar toLowerCase(UChar cc) 54 { 55 ASSERT(isASCIIUpper(cc)); 56 const int lowerCaseOffset = 0x20; 57 return cc + lowerCaseOffset; 58 } 59 60 inline bool isTokenizerWhitespace(UChar cc) 61 { 62 return cc == ' ' || cc == '\x0A' || cc == '\x09' || cc == '\x0C'; 63 } 64 65 inline void advanceStringAndASSERTIgnoringCase(SegmentedString& source, const char* expectedCharacters) 66 { 67 while (*expectedCharacters) 68 source.advanceAndASSERTIgnoringCase(*expectedCharacters++); 69 } 70 71 inline void advanceStringAndASSERT(SegmentedString& source, const char* expectedCharacters) 72 { 73 while (*expectedCharacters) 74 source.advanceAndASSERT(*expectedCharacters++); 75 } 76 77 inline bool vectorEqualsString(const Vector<UChar, 32>& vector, const String& string) 78 { 79 if (vector.size() != string.length()) 80 return false; 81 const UChar* stringData = string.characters(); 82 const UChar* vectorData = vector.data(); 83 // FIXME: Is there a higher-level function we should be calling here? 84 return !memcmp(stringData, vectorData, vector.size() * sizeof(UChar)); 85 } 86 87 inline bool isEndTagBufferingState(HTMLTokenizer::State state) 88 { 89 switch (state) { 90 case HTMLTokenizer::RCDATAEndTagOpenState: 91 case HTMLTokenizer::RCDATAEndTagNameState: 92 case HTMLTokenizer::RAWTEXTEndTagOpenState: 93 case HTMLTokenizer::RAWTEXTEndTagNameState: 94 case HTMLTokenizer::ScriptDataEndTagOpenState: 95 case HTMLTokenizer::ScriptDataEndTagNameState: 96 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 97 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 98 return true; 99 default: 100 return false; 101 } 102 } 103 104 } 105 106 HTMLTokenizer::HTMLTokenizer(bool usePreHTML5ParserQuirks) 107 : m_inputStreamPreprocessor(this) 108 , m_usePreHTML5ParserQuirks(usePreHTML5ParserQuirks) 109 { 110 reset(); 111 } 112 113 HTMLTokenizer::~HTMLTokenizer() 114 { 115 } 116 117 void HTMLTokenizer::reset() 118 { 119 m_state = DataState; 120 m_token = 0; 121 m_lineNumber = 0; 122 m_skipLeadingNewLineForListing = false; 123 m_forceNullCharacterReplacement = false; 124 m_shouldAllowCDATA = false; 125 m_additionalAllowedCharacter = '\0'; 126 } 127 128 inline bool HTMLTokenizer::processEntity(SegmentedString& source) 129 { 130 bool notEnoughCharacters = false; 131 Vector<UChar, 16> decodedEntity; 132 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 133 if (notEnoughCharacters) 134 return false; 135 if (!success) { 136 ASSERT(decodedEntity.isEmpty()); 137 bufferCharacter('&'); 138 } else { 139 Vector<UChar>::const_iterator iter = decodedEntity.begin(); 140 for (; iter != decodedEntity.end(); ++iter) 141 bufferCharacter(*iter); 142 } 143 return true; 144 } 145 146 #if COMPILER(MSVC) 147 // We need to disable the "unreachable code" warning because we want to assert 148 // that some code points aren't reached in the state machine. 149 #pragma warning(disable: 4702) 150 #endif 151 152 #define BEGIN_STATE(stateName) case stateName: stateName: 153 #define END_STATE() ASSERT_NOT_REACHED(); break; 154 155 // We use this macro when the HTML5 spec says "reconsume the current input 156 // character in the <mumble> state." 157 #define RECONSUME_IN(stateName) \ 158 do { \ 159 m_state = stateName; \ 160 goto stateName; \ 161 } while (false) 162 163 // We use this macro when the HTML5 spec says "consume the next input 164 // character ... and switch to the <mumble> state." 165 #define ADVANCE_TO(stateName) \ 166 do { \ 167 m_state = stateName; \ 168 if (!m_inputStreamPreprocessor.advance(source, m_lineNumber)) \ 169 return haveBufferedCharacterToken(); \ 170 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 171 goto stateName; \ 172 } while (false) 173 174 // Sometimes there's more complicated logic in the spec that separates when 175 // we consume the next input character and when we switch to a particular 176 // state. We handle those cases by advancing the source directly and using 177 // this macro to switch to the indicated state. 178 #define SWITCH_TO(stateName) \ 179 do { \ 180 m_state = stateName; \ 181 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \ 182 return haveBufferedCharacterToken(); \ 183 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 184 goto stateName; \ 185 } while (false) 186 187 188 inline void HTMLTokenizer::saveEndTagNameIfNeeded() 189 { 190 ASSERT(m_token->type() != HTMLToken::Uninitialized); 191 if (m_token->type() == HTMLToken::StartTag) 192 m_appropriateEndTagName = m_token->name(); 193 } 194 195 // We use this function when the HTML5 spec says "Emit the current <mumble> 196 // token. Switch to the <mumble> state." We use the word "resume" instead of 197 // switch to indicate that this macro actually returns and that we'll end up 198 // in the state when we "resume" (i.e., are called again). 199 bool HTMLTokenizer::emitAndResumeIn(SegmentedString& source, State state) 200 { 201 m_state = state; 202 source.advance(m_lineNumber); 203 saveEndTagNameIfNeeded(); 204 return true; 205 } 206 207 // Identical to emitAndResumeIn, except does not advance. 208 bool HTMLTokenizer::emitAndReconsumeIn(SegmentedString&, State state) 209 { 210 m_state = state; 211 saveEndTagNameIfNeeded(); 212 return true; 213 } 214 215 // Used to emit the EndOfFile token. 216 // Check if we have buffered characters to emit first before emitting the EOF. 217 bool HTMLTokenizer::emitEndOfFile(SegmentedString& source) 218 { 219 if (haveBufferedCharacterToken()) 220 return true; 221 m_state = DataState; 222 source.advance(m_lineNumber); 223 m_token->clear(); 224 m_token->makeEndOfFile(); 225 return true; 226 } 227 228 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 229 { 230 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 231 source.advance(m_lineNumber); 232 if (m_token->type() == HTMLToken::Character) 233 return true; 234 m_token->beginEndTag(m_bufferedEndTagName); 235 m_bufferedEndTagName.clear(); 236 return false; 237 } 238 239 #define FLUSH_AND_ADVANCE_TO(stateName) \ 240 do { \ 241 m_state = stateName; \ 242 if (flushBufferedEndTag(source)) \ 243 return true; \ 244 if (source.isEmpty() \ 245 || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) \ 246 return haveBufferedCharacterToken(); \ 247 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 248 goto stateName; \ 249 } while (false) 250 251 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, State state) 252 { 253 m_state = state; 254 flushBufferedEndTag(source); 255 return true; 256 } 257 258 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 259 { 260 // If we have a token in progress, then we're supposed to be called back 261 // with the same token so we can finish it. 262 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 263 m_token = &token; 264 265 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 266 // FIXME: This should call flushBufferedEndTag(). 267 // We started an end tag during our last iteration. 268 m_token->beginEndTag(m_bufferedEndTagName); 269 m_bufferedEndTagName.clear(); 270 if (m_state == DataState) { 271 // We're back in the data state, so we must be done with the tag. 272 return true; 273 } 274 } 275 276 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source, m_lineNumber)) 277 return haveBufferedCharacterToken(); 278 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 279 280 // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#parsing-main-inbody 281 // Note that this logic is different than the generic \r\n collapsing 282 // handled in the input stream preprocessor. This logic is here as an 283 // "authoring convenience" so folks can write: 284 // 285 // <pre> 286 // lorem ipsum 287 // lorem ipsum 288 // </pre> 289 // 290 // without getting an extra newline at the start of their <pre> element. 291 if (m_skipLeadingNewLineForListing) { 292 m_skipLeadingNewLineForListing = false; 293 if (cc == '\n') { 294 if (m_state == DataState) 295 ADVANCE_TO(DataState); 296 if (m_state == RCDATAState) 297 ADVANCE_TO(RCDATAState); 298 // When parsing text/plain documents, we run the tokenizer in the 299 // PLAINTEXTState and ignore m_skipLeadingNewLineForListing. 300 ASSERT(m_state == PLAINTEXTState); 301 } 302 } 303 304 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 305 switch (m_state) { 306 BEGIN_STATE(DataState) { 307 if (cc == '&') 308 ADVANCE_TO(CharacterReferenceInDataState); 309 else if (cc == '<') { 310 if (m_token->type() == HTMLToken::Character) { 311 // We have a bunch of character tokens queued up that we 312 // are emitting lazily here. 313 return true; 314 } 315 ADVANCE_TO(TagOpenState); 316 } else if (cc == InputStreamPreprocessor::endOfFileMarker) 317 return emitEndOfFile(source); 318 else { 319 bufferCharacter(cc); 320 ADVANCE_TO(DataState); 321 } 322 } 323 END_STATE() 324 325 BEGIN_STATE(CharacterReferenceInDataState) { 326 if (!processEntity(source)) 327 return haveBufferedCharacterToken(); 328 SWITCH_TO(DataState); 329 } 330 END_STATE() 331 332 BEGIN_STATE(RCDATAState) { 333 if (cc == '&') 334 ADVANCE_TO(CharacterReferenceInRCDATAState); 335 else if (cc == '<') 336 ADVANCE_TO(RCDATALessThanSignState); 337 else if (cc == InputStreamPreprocessor::endOfFileMarker) 338 return emitEndOfFile(source); 339 else { 340 bufferCharacter(cc); 341 ADVANCE_TO(RCDATAState); 342 } 343 } 344 END_STATE() 345 346 BEGIN_STATE(CharacterReferenceInRCDATAState) { 347 if (!processEntity(source)) 348 return haveBufferedCharacterToken(); 349 SWITCH_TO(RCDATAState); 350 } 351 END_STATE() 352 353 BEGIN_STATE(RAWTEXTState) { 354 if (cc == '<') 355 ADVANCE_TO(RAWTEXTLessThanSignState); 356 else if (cc == InputStreamPreprocessor::endOfFileMarker) 357 return emitEndOfFile(source); 358 else { 359 bufferCharacter(cc); 360 ADVANCE_TO(RAWTEXTState); 361 } 362 } 363 END_STATE() 364 365 BEGIN_STATE(ScriptDataState) { 366 if (cc == '<') 367 ADVANCE_TO(ScriptDataLessThanSignState); 368 else if (cc == InputStreamPreprocessor::endOfFileMarker) 369 return emitEndOfFile(source); 370 else { 371 bufferCharacter(cc); 372 ADVANCE_TO(ScriptDataState); 373 } 374 } 375 END_STATE() 376 377 BEGIN_STATE(PLAINTEXTState) { 378 if (cc == InputStreamPreprocessor::endOfFileMarker) 379 return emitEndOfFile(source); 380 else 381 bufferCharacter(cc); 382 ADVANCE_TO(PLAINTEXTState); 383 } 384 END_STATE() 385 386 BEGIN_STATE(TagOpenState) { 387 if (cc == '!') 388 ADVANCE_TO(MarkupDeclarationOpenState); 389 else if (cc == '/') 390 ADVANCE_TO(EndTagOpenState); 391 else if (isASCIIUpper(cc)) { 392 m_token->beginStartTag(toLowerCase(cc)); 393 ADVANCE_TO(TagNameState); 394 } else if (isASCIILower(cc)) { 395 m_token->beginStartTag(cc); 396 ADVANCE_TO(TagNameState); 397 } else if (cc == '?') { 398 parseError(); 399 // The spec consumes the current character before switching 400 // to the bogus comment state, but it's easier to implement 401 // if we reconsume the current character. 402 RECONSUME_IN(BogusCommentState); 403 } else { 404 parseError(); 405 bufferCharacter('<'); 406 RECONSUME_IN(DataState); 407 } 408 } 409 END_STATE() 410 411 BEGIN_STATE(EndTagOpenState) { 412 if (isASCIIUpper(cc)) { 413 m_token->beginEndTag(toLowerCase(cc)); 414 ADVANCE_TO(TagNameState); 415 } else if (isASCIILower(cc)) { 416 m_token->beginEndTag(cc); 417 ADVANCE_TO(TagNameState); 418 } else if (cc == '>') { 419 parseError(); 420 ADVANCE_TO(DataState); 421 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 422 parseError(); 423 bufferCharacter('<'); 424 bufferCharacter('/'); 425 RECONSUME_IN(DataState); 426 } else { 427 parseError(); 428 RECONSUME_IN(BogusCommentState); 429 } 430 } 431 END_STATE() 432 433 BEGIN_STATE(TagNameState) { 434 if (isTokenizerWhitespace(cc)) 435 ADVANCE_TO(BeforeAttributeNameState); 436 else if (cc == '/') 437 ADVANCE_TO(SelfClosingStartTagState); 438 else if (cc == '>') 439 return emitAndResumeIn(source, DataState); 440 else if (m_usePreHTML5ParserQuirks && cc == '<') 441 return emitAndReconsumeIn(source, DataState); 442 else if (isASCIIUpper(cc)) { 443 m_token->appendToName(toLowerCase(cc)); 444 ADVANCE_TO(TagNameState); 445 } if (cc == InputStreamPreprocessor::endOfFileMarker) { 446 parseError(); 447 RECONSUME_IN(DataState); 448 } else { 449 m_token->appendToName(cc); 450 ADVANCE_TO(TagNameState); 451 } 452 } 453 END_STATE() 454 455 BEGIN_STATE(RCDATALessThanSignState) { 456 if (cc == '/') { 457 m_temporaryBuffer.clear(); 458 ASSERT(m_bufferedEndTagName.isEmpty()); 459 ADVANCE_TO(RCDATAEndTagOpenState); 460 } else { 461 bufferCharacter('<'); 462 RECONSUME_IN(RCDATAState); 463 } 464 } 465 END_STATE() 466 467 BEGIN_STATE(RCDATAEndTagOpenState) { 468 if (isASCIIUpper(cc)) { 469 m_temporaryBuffer.append(cc); 470 addToPossibleEndTag(toLowerCase(cc)); 471 ADVANCE_TO(RCDATAEndTagNameState); 472 } else if (isASCIILower(cc)) { 473 m_temporaryBuffer.append(cc); 474 addToPossibleEndTag(cc); 475 ADVANCE_TO(RCDATAEndTagNameState); 476 } else { 477 bufferCharacter('<'); 478 bufferCharacter('/'); 479 RECONSUME_IN(RCDATAState); 480 } 481 } 482 END_STATE() 483 484 BEGIN_STATE(RCDATAEndTagNameState) { 485 if (isASCIIUpper(cc)) { 486 m_temporaryBuffer.append(cc); 487 addToPossibleEndTag(toLowerCase(cc)); 488 ADVANCE_TO(RCDATAEndTagNameState); 489 } else if (isASCIILower(cc)) { 490 m_temporaryBuffer.append(cc); 491 addToPossibleEndTag(cc); 492 ADVANCE_TO(RCDATAEndTagNameState); 493 } else { 494 if (isTokenizerWhitespace(cc)) { 495 if (isAppropriateEndTag()) 496 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 497 } else if (cc == '/') { 498 if (isAppropriateEndTag()) 499 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 500 } else if (cc == '>') { 501 if (isAppropriateEndTag()) 502 return flushEmitAndResumeIn(source, DataState); 503 } 504 bufferCharacter('<'); 505 bufferCharacter('/'); 506 m_token->appendToCharacter(m_temporaryBuffer); 507 m_bufferedEndTagName.clear(); 508 RECONSUME_IN(RCDATAState); 509 } 510 } 511 END_STATE() 512 513 BEGIN_STATE(RAWTEXTLessThanSignState) { 514 if (cc == '/') { 515 m_temporaryBuffer.clear(); 516 ASSERT(m_bufferedEndTagName.isEmpty()); 517 ADVANCE_TO(RAWTEXTEndTagOpenState); 518 } else { 519 bufferCharacter('<'); 520 RECONSUME_IN(RAWTEXTState); 521 } 522 } 523 END_STATE() 524 525 BEGIN_STATE(RAWTEXTEndTagOpenState) { 526 if (isASCIIUpper(cc)) { 527 m_temporaryBuffer.append(cc); 528 addToPossibleEndTag(toLowerCase(cc)); 529 ADVANCE_TO(RAWTEXTEndTagNameState); 530 } else if (isASCIILower(cc)) { 531 m_temporaryBuffer.append(cc); 532 addToPossibleEndTag(cc); 533 ADVANCE_TO(RAWTEXTEndTagNameState); 534 } else { 535 bufferCharacter('<'); 536 bufferCharacter('/'); 537 RECONSUME_IN(RAWTEXTState); 538 } 539 } 540 END_STATE() 541 542 BEGIN_STATE(RAWTEXTEndTagNameState) { 543 if (isASCIIUpper(cc)) { 544 m_temporaryBuffer.append(cc); 545 addToPossibleEndTag(toLowerCase(cc)); 546 ADVANCE_TO(RAWTEXTEndTagNameState); 547 } else if (isASCIILower(cc)) { 548 m_temporaryBuffer.append(cc); 549 addToPossibleEndTag(cc); 550 ADVANCE_TO(RAWTEXTEndTagNameState); 551 } else { 552 if (isTokenizerWhitespace(cc)) { 553 if (isAppropriateEndTag()) 554 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 555 } else if (cc == '/') { 556 if (isAppropriateEndTag()) 557 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 558 } else if (cc == '>') { 559 if (isAppropriateEndTag()) 560 return flushEmitAndResumeIn(source, DataState); 561 } 562 bufferCharacter('<'); 563 bufferCharacter('/'); 564 m_token->appendToCharacter(m_temporaryBuffer); 565 m_bufferedEndTagName.clear(); 566 RECONSUME_IN(RAWTEXTState); 567 } 568 } 569 END_STATE() 570 571 BEGIN_STATE(ScriptDataLessThanSignState) { 572 if (cc == '/') { 573 m_temporaryBuffer.clear(); 574 ASSERT(m_bufferedEndTagName.isEmpty()); 575 ADVANCE_TO(ScriptDataEndTagOpenState); 576 } else if (cc == '!') { 577 bufferCharacter('<'); 578 bufferCharacter('!'); 579 ADVANCE_TO(ScriptDataEscapeStartState); 580 } else { 581 bufferCharacter('<'); 582 RECONSUME_IN(ScriptDataState); 583 } 584 } 585 END_STATE() 586 587 BEGIN_STATE(ScriptDataEndTagOpenState) { 588 if (isASCIIUpper(cc)) { 589 m_temporaryBuffer.append(cc); 590 addToPossibleEndTag(toLowerCase(cc)); 591 ADVANCE_TO(ScriptDataEndTagNameState); 592 } else if (isASCIILower(cc)) { 593 m_temporaryBuffer.append(cc); 594 addToPossibleEndTag(cc); 595 ADVANCE_TO(ScriptDataEndTagNameState); 596 } else { 597 bufferCharacter('<'); 598 bufferCharacter('/'); 599 RECONSUME_IN(ScriptDataState); 600 } 601 } 602 END_STATE() 603 604 BEGIN_STATE(ScriptDataEndTagNameState) { 605 if (isASCIIUpper(cc)) { 606 m_temporaryBuffer.append(cc); 607 addToPossibleEndTag(toLowerCase(cc)); 608 ADVANCE_TO(ScriptDataEndTagNameState); 609 } else if (isASCIILower(cc)) { 610 m_temporaryBuffer.append(cc); 611 addToPossibleEndTag(cc); 612 ADVANCE_TO(ScriptDataEndTagNameState); 613 } else { 614 if (isTokenizerWhitespace(cc)) { 615 if (isAppropriateEndTag()) 616 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 617 } else if (cc == '/') { 618 if (isAppropriateEndTag()) 619 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 620 } else if (cc == '>') { 621 if (isAppropriateEndTag()) 622 return flushEmitAndResumeIn(source, DataState); 623 } 624 bufferCharacter('<'); 625 bufferCharacter('/'); 626 m_token->appendToCharacter(m_temporaryBuffer); 627 m_bufferedEndTagName.clear(); 628 RECONSUME_IN(ScriptDataState); 629 } 630 } 631 END_STATE() 632 633 BEGIN_STATE(ScriptDataEscapeStartState) { 634 if (cc == '-') { 635 bufferCharacter(cc); 636 ADVANCE_TO(ScriptDataEscapeStartDashState); 637 } else 638 RECONSUME_IN(ScriptDataState); 639 } 640 END_STATE() 641 642 BEGIN_STATE(ScriptDataEscapeStartDashState) { 643 if (cc == '-') { 644 bufferCharacter(cc); 645 ADVANCE_TO(ScriptDataEscapedDashDashState); 646 } else 647 RECONSUME_IN(ScriptDataState); 648 } 649 END_STATE() 650 651 BEGIN_STATE(ScriptDataEscapedState) { 652 if (cc == '-') { 653 bufferCharacter(cc); 654 ADVANCE_TO(ScriptDataEscapedDashState); 655 } else if (cc == '<') 656 ADVANCE_TO(ScriptDataEscapedLessThanSignState); 657 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 658 parseError(); 659 RECONSUME_IN(DataState); 660 } else { 661 bufferCharacter(cc); 662 ADVANCE_TO(ScriptDataEscapedState); 663 } 664 } 665 END_STATE() 666 667 BEGIN_STATE(ScriptDataEscapedDashState) { 668 if (cc == '-') { 669 bufferCharacter(cc); 670 ADVANCE_TO(ScriptDataEscapedDashDashState); 671 } else if (cc == '<') 672 ADVANCE_TO(ScriptDataEscapedLessThanSignState); 673 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 674 parseError(); 675 RECONSUME_IN(DataState); 676 } else { 677 bufferCharacter(cc); 678 ADVANCE_TO(ScriptDataEscapedState); 679 } 680 } 681 END_STATE() 682 683 BEGIN_STATE(ScriptDataEscapedDashDashState) { 684 if (cc == '-') { 685 bufferCharacter(cc); 686 ADVANCE_TO(ScriptDataEscapedDashDashState); 687 } else if (cc == '<') 688 ADVANCE_TO(ScriptDataEscapedLessThanSignState); 689 else if (cc == '>') { 690 bufferCharacter(cc); 691 ADVANCE_TO(ScriptDataState); 692 } if (cc == InputStreamPreprocessor::endOfFileMarker) { 693 parseError(); 694 RECONSUME_IN(DataState); 695 } else { 696 bufferCharacter(cc); 697 ADVANCE_TO(ScriptDataEscapedState); 698 } 699 } 700 END_STATE() 701 702 BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 703 if (cc == '/') { 704 m_temporaryBuffer.clear(); 705 ASSERT(m_bufferedEndTagName.isEmpty()); 706 ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 707 } else if (isASCIIUpper(cc)) { 708 bufferCharacter('<'); 709 bufferCharacter(cc); 710 m_temporaryBuffer.clear(); 711 m_temporaryBuffer.append(toLowerCase(cc)); 712 ADVANCE_TO(ScriptDataDoubleEscapeStartState); 713 } else if (isASCIILower(cc)) { 714 bufferCharacter('<'); 715 bufferCharacter(cc); 716 m_temporaryBuffer.clear(); 717 m_temporaryBuffer.append(cc); 718 ADVANCE_TO(ScriptDataDoubleEscapeStartState); 719 } else { 720 bufferCharacter('<'); 721 RECONSUME_IN(ScriptDataEscapedState); 722 } 723 } 724 END_STATE() 725 726 BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 727 if (isASCIIUpper(cc)) { 728 m_temporaryBuffer.append(cc); 729 addToPossibleEndTag(toLowerCase(cc)); 730 ADVANCE_TO(ScriptDataEscapedEndTagNameState); 731 } else if (isASCIILower(cc)) { 732 m_temporaryBuffer.append(cc); 733 addToPossibleEndTag(cc); 734 ADVANCE_TO(ScriptDataEscapedEndTagNameState); 735 } else { 736 bufferCharacter('<'); 737 bufferCharacter('/'); 738 RECONSUME_IN(ScriptDataEscapedState); 739 } 740 } 741 END_STATE() 742 743 BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 744 if (isASCIIUpper(cc)) { 745 m_temporaryBuffer.append(cc); 746 addToPossibleEndTag(toLowerCase(cc)); 747 ADVANCE_TO(ScriptDataEscapedEndTagNameState); 748 } else if (isASCIILower(cc)) { 749 m_temporaryBuffer.append(cc); 750 addToPossibleEndTag(cc); 751 ADVANCE_TO(ScriptDataEscapedEndTagNameState); 752 } else { 753 if (isTokenizerWhitespace(cc)) { 754 if (isAppropriateEndTag()) 755 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 756 } else if (cc == '/') { 757 if (isAppropriateEndTag()) 758 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 759 } else if (cc == '>') { 760 if (isAppropriateEndTag()) 761 return flushEmitAndResumeIn(source, DataState); 762 } 763 bufferCharacter('<'); 764 bufferCharacter('/'); 765 m_token->appendToCharacter(m_temporaryBuffer); 766 m_bufferedEndTagName.clear(); 767 RECONSUME_IN(ScriptDataEscapedState); 768 } 769 } 770 END_STATE() 771 772 BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 773 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 774 bufferCharacter(cc); 775 if (temporaryBufferIs(scriptTag.localName())) 776 ADVANCE_TO(ScriptDataDoubleEscapedState); 777 else 778 ADVANCE_TO(ScriptDataEscapedState); 779 } else if (isASCIIUpper(cc)) { 780 bufferCharacter(cc); 781 m_temporaryBuffer.append(toLowerCase(cc)); 782 ADVANCE_TO(ScriptDataDoubleEscapeStartState); 783 } else if (isASCIILower(cc)) { 784 bufferCharacter(cc); 785 m_temporaryBuffer.append(cc); 786 ADVANCE_TO(ScriptDataDoubleEscapeStartState); 787 } else 788 RECONSUME_IN(ScriptDataEscapedState); 789 } 790 END_STATE() 791 792 BEGIN_STATE(ScriptDataDoubleEscapedState) { 793 if (cc == '-') { 794 bufferCharacter(cc); 795 ADVANCE_TO(ScriptDataDoubleEscapedDashState); 796 } else if (cc == '<') { 797 bufferCharacter(cc); 798 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 799 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 800 parseError(); 801 RECONSUME_IN(DataState); 802 } else { 803 bufferCharacter(cc); 804 ADVANCE_TO(ScriptDataDoubleEscapedState); 805 } 806 } 807 END_STATE() 808 809 BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 810 if (cc == '-') { 811 bufferCharacter(cc); 812 ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 813 } else if (cc == '<') { 814 bufferCharacter(cc); 815 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 816 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 817 parseError(); 818 RECONSUME_IN(DataState); 819 } else { 820 bufferCharacter(cc); 821 ADVANCE_TO(ScriptDataDoubleEscapedState); 822 } 823 } 824 END_STATE() 825 826 BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 827 if (cc == '-') { 828 bufferCharacter(cc); 829 ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 830 } else if (cc == '<') { 831 bufferCharacter(cc); 832 ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 833 } else if (cc == '>') { 834 bufferCharacter(cc); 835 ADVANCE_TO(ScriptDataState); 836 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 837 parseError(); 838 RECONSUME_IN(DataState); 839 } else { 840 bufferCharacter(cc); 841 ADVANCE_TO(ScriptDataDoubleEscapedState); 842 } 843 } 844 END_STATE() 845 846 BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 847 if (cc == '/') { 848 bufferCharacter(cc); 849 m_temporaryBuffer.clear(); 850 ADVANCE_TO(ScriptDataDoubleEscapeEndState); 851 } else 852 RECONSUME_IN(ScriptDataDoubleEscapedState); 853 } 854 END_STATE() 855 856 BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 857 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 858 bufferCharacter(cc); 859 if (temporaryBufferIs(scriptTag.localName())) 860 ADVANCE_TO(ScriptDataEscapedState); 861 else 862 ADVANCE_TO(ScriptDataDoubleEscapedState); 863 } else if (isASCIIUpper(cc)) { 864 bufferCharacter(cc); 865 m_temporaryBuffer.append(toLowerCase(cc)); 866 ADVANCE_TO(ScriptDataDoubleEscapeEndState); 867 } else if (isASCIILower(cc)) { 868 bufferCharacter(cc); 869 m_temporaryBuffer.append(cc); 870 ADVANCE_TO(ScriptDataDoubleEscapeEndState); 871 } else 872 RECONSUME_IN(ScriptDataDoubleEscapedState); 873 } 874 END_STATE() 875 876 BEGIN_STATE(BeforeAttributeNameState) { 877 if (isTokenizerWhitespace(cc)) 878 ADVANCE_TO(BeforeAttributeNameState); 879 else if (cc == '/') 880 ADVANCE_TO(SelfClosingStartTagState); 881 else if (cc == '>') 882 return emitAndResumeIn(source, DataState); 883 else if (m_usePreHTML5ParserQuirks && cc == '<') 884 return emitAndReconsumeIn(source, DataState); 885 else if (isASCIIUpper(cc)) { 886 m_token->addNewAttribute(); 887 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 888 m_token->appendToAttributeName(toLowerCase(cc)); 889 ADVANCE_TO(AttributeNameState); 890 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 891 parseError(); 892 RECONSUME_IN(DataState); 893 } else { 894 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 895 parseError(); 896 m_token->addNewAttribute(); 897 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 898 m_token->appendToAttributeName(cc); 899 ADVANCE_TO(AttributeNameState); 900 } 901 } 902 END_STATE() 903 904 BEGIN_STATE(AttributeNameState) { 905 if (isTokenizerWhitespace(cc)) { 906 m_token->endAttributeName(source.numberOfCharactersConsumed()); 907 ADVANCE_TO(AfterAttributeNameState); 908 } else if (cc == '/') { 909 m_token->endAttributeName(source.numberOfCharactersConsumed()); 910 ADVANCE_TO(SelfClosingStartTagState); 911 } else if (cc == '=') { 912 m_token->endAttributeName(source.numberOfCharactersConsumed()); 913 ADVANCE_TO(BeforeAttributeValueState); 914 } else if (cc == '>') { 915 m_token->endAttributeName(source.numberOfCharactersConsumed()); 916 return emitAndResumeIn(source, DataState); 917 } else if (m_usePreHTML5ParserQuirks && cc == '<') { 918 m_token->endAttributeName(source.numberOfCharactersConsumed()); 919 return emitAndReconsumeIn(source, DataState); 920 } else if (isASCIIUpper(cc)) { 921 m_token->appendToAttributeName(toLowerCase(cc)); 922 ADVANCE_TO(AttributeNameState); 923 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 924 parseError(); 925 m_token->endAttributeName(source.numberOfCharactersConsumed()); 926 RECONSUME_IN(DataState); 927 } else { 928 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 929 parseError(); 930 m_token->appendToAttributeName(cc); 931 ADVANCE_TO(AttributeNameState); 932 } 933 } 934 END_STATE() 935 936 BEGIN_STATE(AfterAttributeNameState) { 937 if (isTokenizerWhitespace(cc)) 938 ADVANCE_TO(AfterAttributeNameState); 939 else if (cc == '/') 940 ADVANCE_TO(SelfClosingStartTagState); 941 else if (cc == '=') 942 ADVANCE_TO(BeforeAttributeValueState); 943 else if (cc == '>') 944 return emitAndResumeIn(source, DataState); 945 else if (m_usePreHTML5ParserQuirks && cc == '<') 946 return emitAndReconsumeIn(source, DataState); 947 else if (isASCIIUpper(cc)) { 948 m_token->addNewAttribute(); 949 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 950 m_token->appendToAttributeName(toLowerCase(cc)); 951 ADVANCE_TO(AttributeNameState); 952 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 953 parseError(); 954 RECONSUME_IN(DataState); 955 } else { 956 if (cc == '"' || cc == '\'' || cc == '<') 957 parseError(); 958 m_token->addNewAttribute(); 959 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 960 m_token->appendToAttributeName(cc); 961 ADVANCE_TO(AttributeNameState); 962 } 963 } 964 END_STATE() 965 966 BEGIN_STATE(BeforeAttributeValueState) { 967 if (isTokenizerWhitespace(cc)) 968 ADVANCE_TO(BeforeAttributeValueState); 969 else if (cc == '"') { 970 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 971 ADVANCE_TO(AttributeValueDoubleQuotedState); 972 } else if (cc == '&') { 973 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 974 RECONSUME_IN(AttributeValueUnquotedState); 975 } else if (cc == '\'') { 976 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 977 ADVANCE_TO(AttributeValueSingleQuotedState); 978 } else if (cc == '>') { 979 parseError(); 980 return emitAndResumeIn(source, DataState); 981 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 982 parseError(); 983 RECONSUME_IN(DataState); 984 } else { 985 if (cc == '<' || cc == '=' || cc == '`') 986 parseError(); 987 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 988 m_token->appendToAttributeValue(cc); 989 ADVANCE_TO(AttributeValueUnquotedState); 990 } 991 } 992 END_STATE() 993 994 BEGIN_STATE(AttributeValueDoubleQuotedState) { 995 if (cc == '"') { 996 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 997 ADVANCE_TO(AfterAttributeValueQuotedState); 998 } else if (cc == '&') { 999 m_additionalAllowedCharacter = '"'; 1000 ADVANCE_TO(CharacterReferenceInAttributeValueState); 1001 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1002 parseError(); 1003 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1004 RECONSUME_IN(DataState); 1005 } else { 1006 m_token->appendToAttributeValue(cc); 1007 ADVANCE_TO(AttributeValueDoubleQuotedState); 1008 } 1009 } 1010 END_STATE() 1011 1012 BEGIN_STATE(AttributeValueSingleQuotedState) { 1013 if (cc == '\'') { 1014 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1015 ADVANCE_TO(AfterAttributeValueQuotedState); 1016 } else if (cc == '&') { 1017 m_additionalAllowedCharacter = '\''; 1018 ADVANCE_TO(CharacterReferenceInAttributeValueState); 1019 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1020 parseError(); 1021 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1022 RECONSUME_IN(DataState); 1023 } else { 1024 m_token->appendToAttributeValue(cc); 1025 ADVANCE_TO(AttributeValueSingleQuotedState); 1026 } 1027 } 1028 END_STATE() 1029 1030 BEGIN_STATE(AttributeValueUnquotedState) { 1031 if (isTokenizerWhitespace(cc)) { 1032 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1033 ADVANCE_TO(BeforeAttributeNameState); 1034 } else if (cc == '&') { 1035 m_additionalAllowedCharacter = '>'; 1036 ADVANCE_TO(CharacterReferenceInAttributeValueState); 1037 } else if (cc == '>') { 1038 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1039 return emitAndResumeIn(source, DataState); 1040 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1041 parseError(); 1042 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 1043 RECONSUME_IN(DataState); 1044 } else { 1045 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 1046 parseError(); 1047 m_token->appendToAttributeValue(cc); 1048 ADVANCE_TO(AttributeValueUnquotedState); 1049 } 1050 } 1051 END_STATE() 1052 1053 BEGIN_STATE(CharacterReferenceInAttributeValueState) { 1054 bool notEnoughCharacters = false; 1055 Vector<UChar, 16> decodedEntity; 1056 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 1057 if (notEnoughCharacters) 1058 return haveBufferedCharacterToken(); 1059 if (!success) { 1060 ASSERT(decodedEntity.isEmpty()); 1061 m_token->appendToAttributeValue('&'); 1062 } else { 1063 Vector<UChar>::const_iterator iter = decodedEntity.begin(); 1064 for (; iter != decodedEntity.end(); ++iter) 1065 m_token->appendToAttributeValue(*iter); 1066 } 1067 // We're supposed to switch back to the attribute value state that 1068 // we were in when we were switched into this state. Rather than 1069 // keeping track of this explictly, we observe that the previous 1070 // state can be determined by m_additionalAllowedCharacter. 1071 if (m_additionalAllowedCharacter == '"') 1072 SWITCH_TO(AttributeValueDoubleQuotedState); 1073 else if (m_additionalAllowedCharacter == '\'') 1074 SWITCH_TO(AttributeValueSingleQuotedState); 1075 else if (m_additionalAllowedCharacter == '>') 1076 SWITCH_TO(AttributeValueUnquotedState); 1077 else 1078 ASSERT_NOT_REACHED(); 1079 } 1080 END_STATE() 1081 1082 BEGIN_STATE(AfterAttributeValueQuotedState) { 1083 if (isTokenizerWhitespace(cc)) 1084 ADVANCE_TO(BeforeAttributeNameState); 1085 else if (cc == '/') 1086 ADVANCE_TO(SelfClosingStartTagState); 1087 else if (cc == '>') 1088 return emitAndResumeIn(source, DataState); 1089 else if (m_usePreHTML5ParserQuirks && cc == '<') 1090 return emitAndReconsumeIn(source, DataState); 1091 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1092 parseError(); 1093 RECONSUME_IN(DataState); 1094 } else { 1095 parseError(); 1096 RECONSUME_IN(BeforeAttributeNameState); 1097 } 1098 } 1099 END_STATE() 1100 1101 BEGIN_STATE(SelfClosingStartTagState) { 1102 if (cc == '>') { 1103 m_token->setSelfClosing(); 1104 return emitAndResumeIn(source, DataState); 1105 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1106 parseError(); 1107 RECONSUME_IN(DataState); 1108 } else { 1109 parseError(); 1110 RECONSUME_IN(BeforeAttributeNameState); 1111 } 1112 } 1113 END_STATE() 1114 1115 BEGIN_STATE(BogusCommentState) { 1116 m_token->beginComment(); 1117 RECONSUME_IN(ContinueBogusCommentState); 1118 } 1119 END_STATE() 1120 1121 BEGIN_STATE(ContinueBogusCommentState) { 1122 if (cc == '>') 1123 return emitAndResumeIn(source, DataState); 1124 else if (cc == InputStreamPreprocessor::endOfFileMarker) 1125 return emitAndReconsumeIn(source, DataState); 1126 else { 1127 m_token->appendToComment(cc); 1128 ADVANCE_TO(ContinueBogusCommentState); 1129 } 1130 } 1131 END_STATE() 1132 1133 BEGIN_STATE(MarkupDeclarationOpenState) { 1134 DEFINE_STATIC_LOCAL(String, dashDashString, ("--")); 1135 DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype")); 1136 DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA[")); 1137 if (cc == '-') { 1138 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); 1139 if (result == SegmentedString::DidMatch) { 1140 source.advanceAndASSERT('-'); 1141 source.advanceAndASSERT('-'); 1142 m_token->beginComment(); 1143 SWITCH_TO(CommentStartState); 1144 } else if (result == SegmentedString::NotEnoughCharacters) 1145 return haveBufferedCharacterToken(); 1146 } else if (cc == 'D' || cc == 'd') { 1147 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); 1148 if (result == SegmentedString::DidMatch) { 1149 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1150 SWITCH_TO(DOCTYPEState); 1151 } else if (result == SegmentedString::NotEnoughCharacters) 1152 return haveBufferedCharacterToken(); 1153 } else if (cc == '[' && shouldAllowCDATA()) { 1154 SegmentedString::LookAheadResult result = source.lookAhead(cdataString); 1155 if (result == SegmentedString::DidMatch) { 1156 advanceStringAndASSERT(source, "[CDATA["); 1157 SWITCH_TO(CDATASectionState); 1158 } else if (result == SegmentedString::NotEnoughCharacters) 1159 return haveBufferedCharacterToken(); 1160 } 1161 parseError(); 1162 RECONSUME_IN(BogusCommentState); 1163 } 1164 END_STATE() 1165 1166 BEGIN_STATE(CommentStartState) { 1167 if (cc == '-') 1168 ADVANCE_TO(CommentStartDashState); 1169 else if (cc == '>') { 1170 parseError(); 1171 return emitAndResumeIn(source, DataState); 1172 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1173 parseError(); 1174 return emitAndReconsumeIn(source, DataState); 1175 } else { 1176 m_token->appendToComment(cc); 1177 ADVANCE_TO(CommentState); 1178 } 1179 } 1180 END_STATE() 1181 1182 BEGIN_STATE(CommentStartDashState) { 1183 if (cc == '-') 1184 ADVANCE_TO(CommentEndState); 1185 else if (cc == '>') { 1186 parseError(); 1187 return emitAndResumeIn(source, DataState); 1188 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1189 parseError(); 1190 return emitAndReconsumeIn(source, DataState); 1191 } else { 1192 m_token->appendToComment('-'); 1193 m_token->appendToComment(cc); 1194 ADVANCE_TO(CommentState); 1195 } 1196 } 1197 END_STATE() 1198 1199 BEGIN_STATE(CommentState) { 1200 if (cc == '-') 1201 ADVANCE_TO(CommentEndDashState); 1202 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1203 parseError(); 1204 return emitAndReconsumeIn(source, DataState); 1205 } else { 1206 m_token->appendToComment(cc); 1207 ADVANCE_TO(CommentState); 1208 } 1209 } 1210 END_STATE() 1211 1212 BEGIN_STATE(CommentEndDashState) { 1213 if (cc == '-') 1214 ADVANCE_TO(CommentEndState); 1215 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1216 parseError(); 1217 return emitAndReconsumeIn(source, DataState); 1218 } else { 1219 m_token->appendToComment('-'); 1220 m_token->appendToComment(cc); 1221 ADVANCE_TO(CommentState); 1222 } 1223 } 1224 END_STATE() 1225 1226 BEGIN_STATE(CommentEndState) { 1227 if (cc == '>') 1228 return emitAndResumeIn(source, DataState); 1229 else if (cc == '!') { 1230 parseError(); 1231 ADVANCE_TO(CommentEndBangState); 1232 } else if (cc == '-') { 1233 parseError(); 1234 m_token->appendToComment('-'); 1235 ADVANCE_TO(CommentEndState); 1236 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1237 parseError(); 1238 return emitAndReconsumeIn(source, DataState); 1239 } else { 1240 parseError(); 1241 m_token->appendToComment('-'); 1242 m_token->appendToComment('-'); 1243 m_token->appendToComment(cc); 1244 ADVANCE_TO(CommentState); 1245 } 1246 } 1247 END_STATE() 1248 1249 BEGIN_STATE(CommentEndBangState) { 1250 if (cc == '-') { 1251 m_token->appendToComment('-'); 1252 m_token->appendToComment('-'); 1253 m_token->appendToComment('!'); 1254 ADVANCE_TO(CommentEndDashState); 1255 } else if (cc == '>') 1256 return emitAndResumeIn(source, DataState); 1257 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1258 parseError(); 1259 return emitAndReconsumeIn(source, DataState); 1260 } else { 1261 m_token->appendToComment('-'); 1262 m_token->appendToComment('-'); 1263 m_token->appendToComment('!'); 1264 m_token->appendToComment(cc); 1265 ADVANCE_TO(CommentState); 1266 } 1267 } 1268 END_STATE() 1269 1270 BEGIN_STATE(DOCTYPEState) { 1271 if (isTokenizerWhitespace(cc)) 1272 ADVANCE_TO(BeforeDOCTYPENameState); 1273 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1274 parseError(); 1275 m_token->beginDOCTYPE(); 1276 m_token->setForceQuirks(); 1277 return emitAndReconsumeIn(source, DataState); 1278 } else { 1279 parseError(); 1280 RECONSUME_IN(BeforeDOCTYPENameState); 1281 } 1282 } 1283 END_STATE() 1284 1285 BEGIN_STATE(BeforeDOCTYPENameState) { 1286 if (isTokenizerWhitespace(cc)) 1287 ADVANCE_TO(BeforeDOCTYPENameState); 1288 else if (isASCIIUpper(cc)) { 1289 m_token->beginDOCTYPE(toLowerCase(cc)); 1290 ADVANCE_TO(DOCTYPENameState); 1291 } else if (cc == '>') { 1292 parseError(); 1293 m_token->beginDOCTYPE(); 1294 m_token->setForceQuirks(); 1295 return emitAndResumeIn(source, DataState); 1296 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1297 parseError(); 1298 m_token->beginDOCTYPE(); 1299 m_token->setForceQuirks(); 1300 return emitAndReconsumeIn(source, DataState); 1301 } else { 1302 m_token->beginDOCTYPE(cc); 1303 ADVANCE_TO(DOCTYPENameState); 1304 } 1305 } 1306 END_STATE() 1307 1308 BEGIN_STATE(DOCTYPENameState) { 1309 if (isTokenizerWhitespace(cc)) 1310 ADVANCE_TO(AfterDOCTYPENameState); 1311 else if (cc == '>') 1312 return emitAndResumeIn(source, DataState); 1313 else if (isASCIIUpper(cc)) { 1314 m_token->appendToName(toLowerCase(cc)); 1315 ADVANCE_TO(DOCTYPENameState); 1316 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1317 parseError(); 1318 m_token->setForceQuirks(); 1319 return emitAndReconsumeIn(source, DataState); 1320 } else { 1321 m_token->appendToName(cc); 1322 ADVANCE_TO(DOCTYPENameState); 1323 } 1324 } 1325 END_STATE() 1326 1327 BEGIN_STATE(AfterDOCTYPENameState) { 1328 if (isTokenizerWhitespace(cc)) 1329 ADVANCE_TO(AfterDOCTYPENameState); 1330 if (cc == '>') 1331 return emitAndResumeIn(source, DataState); 1332 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1333 parseError(); 1334 m_token->setForceQuirks(); 1335 return emitAndReconsumeIn(source, DataState); 1336 } else { 1337 DEFINE_STATIC_LOCAL(String, publicString, ("public")); 1338 DEFINE_STATIC_LOCAL(String, systemString, ("system")); 1339 if (cc == 'P' || cc == 'p') { 1340 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); 1341 if (result == SegmentedString::DidMatch) { 1342 advanceStringAndASSERTIgnoringCase(source, "public"); 1343 SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1344 } else if (result == SegmentedString::NotEnoughCharacters) 1345 return haveBufferedCharacterToken(); 1346 } else if (cc == 'S' || cc == 's') { 1347 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); 1348 if (result == SegmentedString::DidMatch) { 1349 advanceStringAndASSERTIgnoringCase(source, "system"); 1350 SWITCH_TO(AfterDOCTYPESystemKeywordState); 1351 } else if (result == SegmentedString::NotEnoughCharacters) 1352 return haveBufferedCharacterToken(); 1353 } 1354 parseError(); 1355 m_token->setForceQuirks(); 1356 ADVANCE_TO(BogusDOCTYPEState); 1357 } 1358 } 1359 END_STATE() 1360 1361 BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1362 if (isTokenizerWhitespace(cc)) 1363 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1364 else if (cc == '"') { 1365 parseError(); 1366 m_token->setPublicIdentifierToEmptyString(); 1367 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1368 } else if (cc == '\'') { 1369 parseError(); 1370 m_token->setPublicIdentifierToEmptyString(); 1371 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1372 } else if (cc == '>') { 1373 parseError(); 1374 m_token->setForceQuirks(); 1375 return emitAndResumeIn(source, DataState); 1376 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1377 parseError(); 1378 m_token->setForceQuirks(); 1379 return emitAndReconsumeIn(source, DataState); 1380 } else { 1381 parseError(); 1382 m_token->setForceQuirks(); 1383 ADVANCE_TO(BogusDOCTYPEState); 1384 } 1385 } 1386 END_STATE() 1387 1388 BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1389 if (isTokenizerWhitespace(cc)) 1390 ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1391 else if (cc == '"') { 1392 m_token->setPublicIdentifierToEmptyString(); 1393 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1394 } else if (cc == '\'') { 1395 m_token->setPublicIdentifierToEmptyString(); 1396 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1397 } else if (cc == '>') { 1398 parseError(); 1399 m_token->setForceQuirks(); 1400 return emitAndResumeIn(source, DataState); 1401 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1402 parseError(); 1403 m_token->setForceQuirks(); 1404 return emitAndReconsumeIn(source, DataState); 1405 } else { 1406 parseError(); 1407 m_token->setForceQuirks(); 1408 ADVANCE_TO(BogusDOCTYPEState); 1409 } 1410 } 1411 END_STATE() 1412 1413 BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1414 if (cc == '"') 1415 ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1416 else if (cc == '>') { 1417 parseError(); 1418 m_token->setForceQuirks(); 1419 return emitAndResumeIn(source, DataState); 1420 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1421 parseError(); 1422 m_token->setForceQuirks(); 1423 return emitAndReconsumeIn(source, DataState); 1424 } else { 1425 m_token->appendToPublicIdentifier(cc); 1426 ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1427 } 1428 } 1429 END_STATE() 1430 1431 BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1432 if (cc == '\'') 1433 ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1434 else if (cc == '>') { 1435 parseError(); 1436 m_token->setForceQuirks(); 1437 return emitAndResumeIn(source, DataState); 1438 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1439 parseError(); 1440 m_token->setForceQuirks(); 1441 return emitAndReconsumeIn(source, DataState); 1442 } else { 1443 m_token->appendToPublicIdentifier(cc); 1444 ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1445 } 1446 } 1447 END_STATE() 1448 1449 BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1450 if (isTokenizerWhitespace(cc)) 1451 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1452 else if (cc == '>') 1453 return emitAndResumeIn(source, DataState); 1454 else if (cc == '"') { 1455 parseError(); 1456 m_token->setSystemIdentifierToEmptyString(); 1457 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1458 } else if (cc == '\'') { 1459 parseError(); 1460 m_token->setSystemIdentifierToEmptyString(); 1461 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1462 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1463 parseError(); 1464 m_token->setForceQuirks(); 1465 return emitAndReconsumeIn(source, DataState); 1466 } else { 1467 parseError(); 1468 m_token->setForceQuirks(); 1469 ADVANCE_TO(BogusDOCTYPEState); 1470 } 1471 } 1472 END_STATE() 1473 1474 BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1475 if (isTokenizerWhitespace(cc)) 1476 ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1477 else if (cc == '>') 1478 return emitAndResumeIn(source, DataState); 1479 else if (cc == '"') { 1480 m_token->setSystemIdentifierToEmptyString(); 1481 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1482 } else if (cc == '\'') { 1483 m_token->setSystemIdentifierToEmptyString(); 1484 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1485 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1486 parseError(); 1487 m_token->setForceQuirks(); 1488 return emitAndReconsumeIn(source, DataState); 1489 } else { 1490 parseError(); 1491 m_token->setForceQuirks(); 1492 ADVANCE_TO(BogusDOCTYPEState); 1493 } 1494 } 1495 END_STATE() 1496 1497 BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1498 if (isTokenizerWhitespace(cc)) 1499 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1500 else if (cc == '"') { 1501 parseError(); 1502 m_token->setSystemIdentifierToEmptyString(); 1503 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1504 } else if (cc == '\'') { 1505 parseError(); 1506 m_token->setSystemIdentifierToEmptyString(); 1507 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1508 } else if (cc == '>') { 1509 parseError(); 1510 m_token->setForceQuirks(); 1511 return emitAndResumeIn(source, DataState); 1512 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1513 parseError(); 1514 m_token->setForceQuirks(); 1515 return emitAndReconsumeIn(source, DataState); 1516 } else { 1517 parseError(); 1518 m_token->setForceQuirks(); 1519 ADVANCE_TO(BogusDOCTYPEState); 1520 } 1521 } 1522 END_STATE() 1523 1524 BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1525 if (isTokenizerWhitespace(cc)) 1526 ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1527 if (cc == '"') { 1528 m_token->setSystemIdentifierToEmptyString(); 1529 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1530 } else if (cc == '\'') { 1531 m_token->setSystemIdentifierToEmptyString(); 1532 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1533 } else if (cc == '>') { 1534 parseError(); 1535 m_token->setForceQuirks(); 1536 return emitAndResumeIn(source, DataState); 1537 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1538 parseError(); 1539 m_token->setForceQuirks(); 1540 return emitAndReconsumeIn(source, DataState); 1541 } else { 1542 parseError(); 1543 m_token->setForceQuirks(); 1544 ADVANCE_TO(BogusDOCTYPEState); 1545 } 1546 } 1547 END_STATE() 1548 1549 BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1550 if (cc == '"') 1551 ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1552 else if (cc == '>') { 1553 parseError(); 1554 m_token->setForceQuirks(); 1555 return emitAndResumeIn(source, DataState); 1556 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1557 parseError(); 1558 m_token->setForceQuirks(); 1559 return emitAndReconsumeIn(source, DataState); 1560 } else { 1561 m_token->appendToSystemIdentifier(cc); 1562 ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1563 } 1564 } 1565 END_STATE() 1566 1567 BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1568 if (cc == '\'') 1569 ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1570 else if (cc == '>') { 1571 parseError(); 1572 m_token->setForceQuirks(); 1573 return emitAndResumeIn(source, DataState); 1574 } else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1575 parseError(); 1576 m_token->setForceQuirks(); 1577 return emitAndReconsumeIn(source, DataState); 1578 } else { 1579 m_token->appendToSystemIdentifier(cc); 1580 ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1581 } 1582 } 1583 END_STATE() 1584 1585 BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1586 if (isTokenizerWhitespace(cc)) 1587 ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1588 else if (cc == '>') 1589 return emitAndResumeIn(source, DataState); 1590 else if (cc == InputStreamPreprocessor::endOfFileMarker) { 1591 parseError(); 1592 m_token->setForceQuirks(); 1593 return emitAndReconsumeIn(source, DataState); 1594 } else { 1595 parseError(); 1596 ADVANCE_TO(BogusDOCTYPEState); 1597 } 1598 } 1599 END_STATE() 1600 1601 BEGIN_STATE(BogusDOCTYPEState) { 1602 if (cc == '>') 1603 return emitAndResumeIn(source, DataState); 1604 else if (cc == InputStreamPreprocessor::endOfFileMarker) 1605 return emitAndReconsumeIn(source, DataState); 1606 ADVANCE_TO(BogusDOCTYPEState); 1607 } 1608 END_STATE() 1609 1610 BEGIN_STATE(CDATASectionState) { 1611 if (cc == ']') 1612 ADVANCE_TO(CDATASectionRightSquareBracketState); 1613 else if (cc == InputStreamPreprocessor::endOfFileMarker) 1614 RECONSUME_IN(DataState); 1615 else { 1616 bufferCharacter(cc); 1617 ADVANCE_TO(CDATASectionState); 1618 } 1619 } 1620 END_STATE() 1621 1622 BEGIN_STATE(CDATASectionRightSquareBracketState) { 1623 if (cc == ']') 1624 ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1625 else { 1626 bufferCharacter(']'); 1627 RECONSUME_IN(CDATASectionState); 1628 } 1629 } 1630 1631 BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1632 if (cc == '>') 1633 ADVANCE_TO(DataState); 1634 else { 1635 bufferCharacter(']'); 1636 bufferCharacter(']'); 1637 RECONSUME_IN(CDATASectionState); 1638 } 1639 } 1640 END_STATE() 1641 1642 } 1643 1644 ASSERT_NOT_REACHED(); 1645 return false; 1646 } 1647 1648 void HTMLTokenizer::updateStateFor(const AtomicString& tagName, Frame* frame) 1649 { 1650 if (tagName == textareaTag || tagName == titleTag) 1651 setState(RCDATAState); 1652 else if (tagName == plaintextTag) 1653 setState(PLAINTEXTState); 1654 else if (tagName == scriptTag) 1655 setState(ScriptDataState); 1656 else if (tagName == styleTag 1657 || tagName == iframeTag 1658 || tagName == xmpTag 1659 || (tagName == noembedTag && HTMLTreeBuilder::pluginsEnabled(frame)) 1660 || tagName == noframesTag 1661 || (tagName == noscriptTag && HTMLTreeBuilder::scriptEnabled(frame))) 1662 setState(RAWTEXTState); 1663 } 1664 1665 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1666 { 1667 return vectorEqualsString(m_temporaryBuffer, expectedString); 1668 } 1669 1670 inline void HTMLTokenizer::addToPossibleEndTag(UChar cc) 1671 { 1672 ASSERT(isEndTagBufferingState(m_state)); 1673 m_bufferedEndTagName.append(cc); 1674 } 1675 1676 inline bool HTMLTokenizer::isAppropriateEndTag() 1677 { 1678 return m_bufferedEndTagName == m_appropriateEndTagName; 1679 } 1680 1681 inline void HTMLTokenizer::bufferCharacter(UChar character) 1682 { 1683 ASSERT(character != InputStreamPreprocessor::endOfFileMarker); 1684 m_token->ensureIsCharacterToken(); 1685 m_token->appendToCharacter(character); 1686 } 1687 1688 inline void HTMLTokenizer::parseError() 1689 { 1690 notImplemented(); 1691 } 1692 1693 inline bool HTMLTokenizer::haveBufferedCharacterToken() 1694 { 1695 return m_token->type() == HTMLToken::Character; 1696 } 1697 1698 } 1699