1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/html/parser/HTMLTokenizer.h" 30 31 #include "core/HTMLNames.h" 32 #include "core/HTMLTokenizerNames.h" 33 #include "core/html/parser/HTMLEntityParser.h" 34 #include "core/html/parser/HTMLParserIdioms.h" 35 #include "core/html/parser/HTMLTreeBuilder.h" 36 #include "platform/NotImplemented.h" 37 #include "core/xml/parser/MarkupTokenizerInlines.h" 38 #include "wtf/ASCIICType.h" 39 #include "wtf/text/AtomicString.h" 40 #include "wtf/unicode/Unicode.h" 41 42 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used 43 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe. 44 #undef DEFINE_STATIC_LOCAL 45 46 namespace blink { 47 48 using namespace HTMLNames; 49 50 // This has to go in a .cpp file, as the linker doesn't like it being included more than once. 51 // We don't have an HTMLToken.cpp though, so this is the next best place. 52 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const 53 { 54 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom); 55 } 56 57 bool AtomicHTMLToken::usesName() const 58 { 59 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 60 } 61 62 bool AtomicHTMLToken::usesAttributes() const 63 { 64 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 65 } 66 67 static inline UChar toLowerCase(UChar cc) 68 { 69 ASSERT(isASCIIUpper(cc)); 70 const int lowerCaseOffset = 0x20; 71 return cc + lowerCaseOffset; 72 } 73 74 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string) 75 { 76 if (vector.size() != string.length()) 77 return false; 78 79 if (!string.length()) 80 return true; 81 82 return equal(string.impl(), vector.data(), vector.size()); 83 } 84 85 static inline bool isEndTagBufferingState(HTMLTokenizer::State state) 86 { 87 switch (state) { 88 case HTMLTokenizer::RCDATAEndTagOpenState: 89 case HTMLTokenizer::RCDATAEndTagNameState: 90 case HTMLTokenizer::RAWTEXTEndTagOpenState: 91 case HTMLTokenizer::RAWTEXTEndTagNameState: 92 case HTMLTokenizer::ScriptDataEndTagOpenState: 93 case HTMLTokenizer::ScriptDataEndTagNameState: 94 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 95 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 96 return true; 97 default: 98 return false; 99 } 100 } 101 102 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) 103 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) 104 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) 105 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) 106 107 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options) 108 : m_inputStreamPreprocessor(this) 109 , m_options(options) 110 { 111 reset(); 112 } 113 114 HTMLTokenizer::~HTMLTokenizer() 115 { 116 } 117 118 void HTMLTokenizer::reset() 119 { 120 m_state = HTMLTokenizer::DataState; 121 m_token = 0; 122 m_forceNullCharacterReplacement = false; 123 m_shouldAllowCDATA = false; 124 m_additionalAllowedCharacter = '\0'; 125 } 126 127 bool HTMLTokenizer::canCreateCheckpoint() const 128 { 129 if (!m_appropriateEndTagName.isEmpty()) 130 return false; 131 if (!m_temporaryBuffer.isEmpty()) 132 return false; 133 if (!m_bufferedEndTagName.isEmpty()) 134 return false; 135 return true; 136 } 137 138 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const 139 { 140 ASSERT(canCreateCheckpoint()); 141 result.options = m_options; 142 result.state = m_state; 143 result.additionalAllowedCharacter = m_additionalAllowedCharacter; 144 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine(); 145 result.shouldAllowCDATA = m_shouldAllowCDATA; 146 } 147 148 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint) 149 { 150 m_token = 0; 151 m_options = checkpoint.options; 152 m_state = checkpoint.state; 153 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter; 154 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine); 155 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA; 156 } 157 158 inline bool HTMLTokenizer::processEntity(SegmentedString& source) 159 { 160 bool notEnoughCharacters = false; 161 DecodedHTMLEntity decodedEntity; 162 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 163 if (notEnoughCharacters) 164 return false; 165 if (!success) { 166 ASSERT(decodedEntity.isEmpty()); 167 bufferCharacter('&'); 168 } else { 169 for (unsigned i = 0; i < decodedEntity.length; ++i) 170 bufferCharacter(decodedEntity.data[i]); 171 } 172 return true; 173 } 174 175 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 176 { 177 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 178 source.advanceAndUpdateLineNumber(); 179 if (m_token->type() == HTMLToken::Character) 180 return true; 181 m_token->beginEndTag(m_bufferedEndTagName); 182 m_bufferedEndTagName.clear(); 183 m_appropriateEndTagName.clear(); 184 m_temporaryBuffer.clear(); 185 return false; 186 } 187 188 #define FLUSH_AND_ADVANCE_TO(stateName) \ 189 do { \ 190 m_state = HTMLTokenizer::stateName; \ 191 if (flushBufferedEndTag(source)) \ 192 return true; \ 193 if (source.isEmpty() \ 194 || !m_inputStreamPreprocessor.peek(source)) \ 195 return haveBufferedCharacterToken(); \ 196 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 197 goto stateName; \ 198 } while (false) 199 200 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 201 { 202 m_state = state; 203 flushBufferedEndTag(source); 204 return true; 205 } 206 207 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 208 { 209 // If we have a token in progress, then we're supposed to be called back 210 // with the same token so we can finish it. 211 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 212 m_token = &token; 213 214 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 215 // FIXME: This should call flushBufferedEndTag(). 216 // We started an end tag during our last iteration. 217 m_token->beginEndTag(m_bufferedEndTagName); 218 m_bufferedEndTagName.clear(); 219 m_appropriateEndTagName.clear(); 220 m_temporaryBuffer.clear(); 221 if (m_state == HTMLTokenizer::DataState) { 222 // We're back in the data state, so we must be done with the tag. 223 return true; 224 } 225 } 226 227 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 228 return haveBufferedCharacterToken(); 229 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 230 231 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 232 switch (m_state) { 233 HTML_BEGIN_STATE(DataState) { 234 if (cc == '&') 235 HTML_ADVANCE_TO(CharacterReferenceInDataState); 236 else if (cc == '<') { 237 if (m_token->type() == HTMLToken::Character) { 238 // We have a bunch of character tokens queued up that we 239 // are emitting lazily here. 240 return true; 241 } 242 HTML_ADVANCE_TO(TagOpenState); 243 } else if (cc == kEndOfFileMarker) 244 return emitEndOfFile(source); 245 else { 246 bufferCharacter(cc); 247 HTML_ADVANCE_TO(DataState); 248 } 249 } 250 END_STATE() 251 252 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 253 if (!processEntity(source)) 254 return haveBufferedCharacterToken(); 255 HTML_SWITCH_TO(DataState); 256 } 257 END_STATE() 258 259 HTML_BEGIN_STATE(RCDATAState) { 260 if (cc == '&') 261 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState); 262 else if (cc == '<') 263 HTML_ADVANCE_TO(RCDATALessThanSignState); 264 else if (cc == kEndOfFileMarker) 265 return emitEndOfFile(source); 266 else { 267 bufferCharacter(cc); 268 HTML_ADVANCE_TO(RCDATAState); 269 } 270 } 271 END_STATE() 272 273 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) { 274 if (!processEntity(source)) 275 return haveBufferedCharacterToken(); 276 HTML_SWITCH_TO(RCDATAState); 277 } 278 END_STATE() 279 280 HTML_BEGIN_STATE(RAWTEXTState) { 281 if (cc == '<') 282 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 283 else if (cc == kEndOfFileMarker) 284 return emitEndOfFile(source); 285 else { 286 bufferCharacter(cc); 287 HTML_ADVANCE_TO(RAWTEXTState); 288 } 289 } 290 END_STATE() 291 292 HTML_BEGIN_STATE(ScriptDataState) { 293 if (cc == '<') 294 HTML_ADVANCE_TO(ScriptDataLessThanSignState); 295 else if (cc == kEndOfFileMarker) 296 return emitEndOfFile(source); 297 else { 298 bufferCharacter(cc); 299 HTML_ADVANCE_TO(ScriptDataState); 300 } 301 } 302 END_STATE() 303 304 HTML_BEGIN_STATE(PLAINTEXTState) { 305 if (cc == kEndOfFileMarker) 306 return emitEndOfFile(source); 307 bufferCharacter(cc); 308 HTML_ADVANCE_TO(PLAINTEXTState); 309 } 310 END_STATE() 311 312 HTML_BEGIN_STATE(TagOpenState) { 313 if (cc == '!') 314 HTML_ADVANCE_TO(MarkupDeclarationOpenState); 315 else if (cc == '/') 316 HTML_ADVANCE_TO(EndTagOpenState); 317 else if (isASCIIUpper(cc)) { 318 m_token->beginStartTag(toLowerCase(cc)); 319 HTML_ADVANCE_TO(TagNameState); 320 } else if (isASCIILower(cc)) { 321 m_token->beginStartTag(cc); 322 HTML_ADVANCE_TO(TagNameState); 323 } else if (cc == '?') { 324 parseError(); 325 // The spec consumes the current character before switching 326 // to the bogus comment state, but it's easier to implement 327 // if we reconsume the current character. 328 HTML_RECONSUME_IN(BogusCommentState); 329 } else { 330 parseError(); 331 bufferCharacter('<'); 332 HTML_RECONSUME_IN(DataState); 333 } 334 } 335 END_STATE() 336 337 HTML_BEGIN_STATE(EndTagOpenState) { 338 if (isASCIIUpper(cc)) { 339 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); 340 m_appropriateEndTagName.clear(); 341 HTML_ADVANCE_TO(TagNameState); 342 } else if (isASCIILower(cc)) { 343 m_token->beginEndTag(static_cast<LChar>(cc)); 344 m_appropriateEndTagName.clear(); 345 HTML_ADVANCE_TO(TagNameState); 346 } else if (cc == '>') { 347 parseError(); 348 HTML_ADVANCE_TO(DataState); 349 } else if (cc == kEndOfFileMarker) { 350 parseError(); 351 bufferCharacter('<'); 352 bufferCharacter('/'); 353 HTML_RECONSUME_IN(DataState); 354 } else { 355 parseError(); 356 HTML_RECONSUME_IN(BogusCommentState); 357 } 358 } 359 END_STATE() 360 361 HTML_BEGIN_STATE(TagNameState) { 362 if (isTokenizerWhitespace(cc)) 363 HTML_ADVANCE_TO(BeforeAttributeNameState); 364 else if (cc == '/') 365 HTML_ADVANCE_TO(SelfClosingStartTagState); 366 else if (cc == '>') 367 return emitAndResumeIn(source, HTMLTokenizer::DataState); 368 else if (isASCIIUpper(cc)) { 369 m_token->appendToName(toLowerCase(cc)); 370 HTML_ADVANCE_TO(TagNameState); 371 } else if (cc == kEndOfFileMarker) { 372 parseError(); 373 HTML_RECONSUME_IN(DataState); 374 } else { 375 m_token->appendToName(cc); 376 HTML_ADVANCE_TO(TagNameState); 377 } 378 } 379 END_STATE() 380 381 HTML_BEGIN_STATE(RCDATALessThanSignState) { 382 if (cc == '/') { 383 m_temporaryBuffer.clear(); 384 ASSERT(m_bufferedEndTagName.isEmpty()); 385 HTML_ADVANCE_TO(RCDATAEndTagOpenState); 386 } else { 387 bufferCharacter('<'); 388 HTML_RECONSUME_IN(RCDATAState); 389 } 390 } 391 END_STATE() 392 393 HTML_BEGIN_STATE(RCDATAEndTagOpenState) { 394 if (isASCIIUpper(cc)) { 395 m_temporaryBuffer.append(static_cast<LChar>(cc)); 396 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 397 HTML_ADVANCE_TO(RCDATAEndTagNameState); 398 } else if (isASCIILower(cc)) { 399 m_temporaryBuffer.append(static_cast<LChar>(cc)); 400 addToPossibleEndTag(static_cast<LChar>(cc)); 401 HTML_ADVANCE_TO(RCDATAEndTagNameState); 402 } else { 403 bufferCharacter('<'); 404 bufferCharacter('/'); 405 HTML_RECONSUME_IN(RCDATAState); 406 } 407 } 408 END_STATE() 409 410 HTML_BEGIN_STATE(RCDATAEndTagNameState) { 411 if (isASCIIUpper(cc)) { 412 m_temporaryBuffer.append(static_cast<LChar>(cc)); 413 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 414 HTML_ADVANCE_TO(RCDATAEndTagNameState); 415 } else if (isASCIILower(cc)) { 416 m_temporaryBuffer.append(static_cast<LChar>(cc)); 417 addToPossibleEndTag(static_cast<LChar>(cc)); 418 HTML_ADVANCE_TO(RCDATAEndTagNameState); 419 } else { 420 if (isTokenizerWhitespace(cc)) { 421 if (isAppropriateEndTag()) { 422 m_temporaryBuffer.append(static_cast<LChar>(cc)); 423 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 424 } 425 } else if (cc == '/') { 426 if (isAppropriateEndTag()) { 427 m_temporaryBuffer.append(static_cast<LChar>(cc)); 428 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 429 } 430 } else if (cc == '>') { 431 if (isAppropriateEndTag()) { 432 m_temporaryBuffer.append(static_cast<LChar>(cc)); 433 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 434 } 435 } 436 bufferCharacter('<'); 437 bufferCharacter('/'); 438 m_token->appendToCharacter(m_temporaryBuffer); 439 m_bufferedEndTagName.clear(); 440 m_temporaryBuffer.clear(); 441 HTML_RECONSUME_IN(RCDATAState); 442 } 443 } 444 END_STATE() 445 446 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) { 447 if (cc == '/') { 448 m_temporaryBuffer.clear(); 449 ASSERT(m_bufferedEndTagName.isEmpty()); 450 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState); 451 } else { 452 bufferCharacter('<'); 453 HTML_RECONSUME_IN(RAWTEXTState); 454 } 455 } 456 END_STATE() 457 458 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) { 459 if (isASCIIUpper(cc)) { 460 m_temporaryBuffer.append(static_cast<LChar>(cc)); 461 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 462 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 463 } else if (isASCIILower(cc)) { 464 m_temporaryBuffer.append(static_cast<LChar>(cc)); 465 addToPossibleEndTag(static_cast<LChar>(cc)); 466 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 467 } else { 468 bufferCharacter('<'); 469 bufferCharacter('/'); 470 HTML_RECONSUME_IN(RAWTEXTState); 471 } 472 } 473 END_STATE() 474 475 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) { 476 if (isASCIIUpper(cc)) { 477 m_temporaryBuffer.append(static_cast<LChar>(cc)); 478 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 479 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 480 } else if (isASCIILower(cc)) { 481 m_temporaryBuffer.append(static_cast<LChar>(cc)); 482 addToPossibleEndTag(static_cast<LChar>(cc)); 483 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 484 } else { 485 if (isTokenizerWhitespace(cc)) { 486 if (isAppropriateEndTag()) { 487 m_temporaryBuffer.append(static_cast<LChar>(cc)); 488 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 489 } 490 } else if (cc == '/') { 491 if (isAppropriateEndTag()) { 492 m_temporaryBuffer.append(static_cast<LChar>(cc)); 493 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 494 } 495 } else if (cc == '>') { 496 if (isAppropriateEndTag()) { 497 m_temporaryBuffer.append(static_cast<LChar>(cc)); 498 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 499 } 500 } 501 bufferCharacter('<'); 502 bufferCharacter('/'); 503 m_token->appendToCharacter(m_temporaryBuffer); 504 m_bufferedEndTagName.clear(); 505 m_temporaryBuffer.clear(); 506 HTML_RECONSUME_IN(RAWTEXTState); 507 } 508 } 509 END_STATE() 510 511 HTML_BEGIN_STATE(ScriptDataLessThanSignState) { 512 if (cc == '/') { 513 m_temporaryBuffer.clear(); 514 ASSERT(m_bufferedEndTagName.isEmpty()); 515 HTML_ADVANCE_TO(ScriptDataEndTagOpenState); 516 } else if (cc == '!') { 517 bufferCharacter('<'); 518 bufferCharacter('!'); 519 HTML_ADVANCE_TO(ScriptDataEscapeStartState); 520 } else { 521 bufferCharacter('<'); 522 HTML_RECONSUME_IN(ScriptDataState); 523 } 524 } 525 END_STATE() 526 527 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) { 528 if (isASCIIUpper(cc)) { 529 m_temporaryBuffer.append(static_cast<LChar>(cc)); 530 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 531 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 532 } else if (isASCIILower(cc)) { 533 m_temporaryBuffer.append(static_cast<LChar>(cc)); 534 addToPossibleEndTag(static_cast<LChar>(cc)); 535 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 536 } else { 537 bufferCharacter('<'); 538 bufferCharacter('/'); 539 HTML_RECONSUME_IN(ScriptDataState); 540 } 541 } 542 END_STATE() 543 544 HTML_BEGIN_STATE(ScriptDataEndTagNameState) { 545 if (isASCIIUpper(cc)) { 546 m_temporaryBuffer.append(static_cast<LChar>(cc)); 547 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 548 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 549 } else if (isASCIILower(cc)) { 550 m_temporaryBuffer.append(static_cast<LChar>(cc)); 551 addToPossibleEndTag(static_cast<LChar>(cc)); 552 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 553 } else { 554 if (isTokenizerWhitespace(cc)) { 555 if (isAppropriateEndTag()) { 556 m_temporaryBuffer.append(static_cast<LChar>(cc)); 557 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 558 } 559 } else if (cc == '/') { 560 if (isAppropriateEndTag()) { 561 m_temporaryBuffer.append(static_cast<LChar>(cc)); 562 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 563 } 564 } else if (cc == '>') { 565 if (isAppropriateEndTag()) { 566 m_temporaryBuffer.append(static_cast<LChar>(cc)); 567 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 568 } 569 } 570 bufferCharacter('<'); 571 bufferCharacter('/'); 572 m_token->appendToCharacter(m_temporaryBuffer); 573 m_bufferedEndTagName.clear(); 574 m_temporaryBuffer.clear(); 575 HTML_RECONSUME_IN(ScriptDataState); 576 } 577 } 578 END_STATE() 579 580 HTML_BEGIN_STATE(ScriptDataEscapeStartState) { 581 if (cc == '-') { 582 bufferCharacter(cc); 583 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState); 584 } else 585 HTML_RECONSUME_IN(ScriptDataState); 586 } 587 END_STATE() 588 589 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) { 590 if (cc == '-') { 591 bufferCharacter(cc); 592 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 593 } else 594 HTML_RECONSUME_IN(ScriptDataState); 595 } 596 END_STATE() 597 598 HTML_BEGIN_STATE(ScriptDataEscapedState) { 599 if (cc == '-') { 600 bufferCharacter(cc); 601 HTML_ADVANCE_TO(ScriptDataEscapedDashState); 602 } else if (cc == '<') 603 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 604 else if (cc == kEndOfFileMarker) { 605 parseError(); 606 HTML_RECONSUME_IN(DataState); 607 } else { 608 bufferCharacter(cc); 609 HTML_ADVANCE_TO(ScriptDataEscapedState); 610 } 611 } 612 END_STATE() 613 614 HTML_BEGIN_STATE(ScriptDataEscapedDashState) { 615 if (cc == '-') { 616 bufferCharacter(cc); 617 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 618 } else if (cc == '<') 619 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 620 else if (cc == kEndOfFileMarker) { 621 parseError(); 622 HTML_RECONSUME_IN(DataState); 623 } else { 624 bufferCharacter(cc); 625 HTML_ADVANCE_TO(ScriptDataEscapedState); 626 } 627 } 628 END_STATE() 629 630 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) { 631 if (cc == '-') { 632 bufferCharacter(cc); 633 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 634 } else if (cc == '<') 635 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 636 else if (cc == '>') { 637 bufferCharacter(cc); 638 HTML_ADVANCE_TO(ScriptDataState); 639 } else if (cc == kEndOfFileMarker) { 640 parseError(); 641 HTML_RECONSUME_IN(DataState); 642 } else { 643 bufferCharacter(cc); 644 HTML_ADVANCE_TO(ScriptDataEscapedState); 645 } 646 } 647 END_STATE() 648 649 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 650 if (cc == '/') { 651 m_temporaryBuffer.clear(); 652 ASSERT(m_bufferedEndTagName.isEmpty()); 653 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 654 } else if (isASCIIUpper(cc)) { 655 bufferCharacter('<'); 656 bufferCharacter(cc); 657 m_temporaryBuffer.clear(); 658 m_temporaryBuffer.append(toLowerCase(cc)); 659 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 660 } else if (isASCIILower(cc)) { 661 bufferCharacter('<'); 662 bufferCharacter(cc); 663 m_temporaryBuffer.clear(); 664 m_temporaryBuffer.append(static_cast<LChar>(cc)); 665 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 666 } else { 667 bufferCharacter('<'); 668 HTML_RECONSUME_IN(ScriptDataEscapedState); 669 } 670 } 671 END_STATE() 672 673 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 674 if (isASCIIUpper(cc)) { 675 m_temporaryBuffer.append(static_cast<LChar>(cc)); 676 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 677 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 678 } else if (isASCIILower(cc)) { 679 m_temporaryBuffer.append(static_cast<LChar>(cc)); 680 addToPossibleEndTag(static_cast<LChar>(cc)); 681 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 682 } else { 683 bufferCharacter('<'); 684 bufferCharacter('/'); 685 HTML_RECONSUME_IN(ScriptDataEscapedState); 686 } 687 } 688 END_STATE() 689 690 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 691 if (isASCIIUpper(cc)) { 692 m_temporaryBuffer.append(static_cast<LChar>(cc)); 693 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 694 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 695 } else if (isASCIILower(cc)) { 696 m_temporaryBuffer.append(static_cast<LChar>(cc)); 697 addToPossibleEndTag(static_cast<LChar>(cc)); 698 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 699 } else { 700 if (isTokenizerWhitespace(cc)) { 701 if (isAppropriateEndTag()) { 702 m_temporaryBuffer.append(static_cast<LChar>(cc)); 703 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 704 } 705 } else if (cc == '/') { 706 if (isAppropriateEndTag()) { 707 m_temporaryBuffer.append(static_cast<LChar>(cc)); 708 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 709 } 710 } else if (cc == '>') { 711 if (isAppropriateEndTag()) { 712 m_temporaryBuffer.append(static_cast<LChar>(cc)); 713 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 714 } 715 } 716 bufferCharacter('<'); 717 bufferCharacter('/'); 718 m_token->appendToCharacter(m_temporaryBuffer); 719 m_bufferedEndTagName.clear(); 720 m_temporaryBuffer.clear(); 721 HTML_RECONSUME_IN(ScriptDataEscapedState); 722 } 723 } 724 END_STATE() 725 726 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 727 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 728 bufferCharacter(cc); 729 if (temporaryBufferIs(scriptTag.localName())) 730 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 731 else 732 HTML_ADVANCE_TO(ScriptDataEscapedState); 733 } else if (isASCIIUpper(cc)) { 734 bufferCharacter(cc); 735 m_temporaryBuffer.append(toLowerCase(cc)); 736 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 737 } else if (isASCIILower(cc)) { 738 bufferCharacter(cc); 739 m_temporaryBuffer.append(static_cast<LChar>(cc)); 740 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 741 } else 742 HTML_RECONSUME_IN(ScriptDataEscapedState); 743 } 744 END_STATE() 745 746 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) { 747 if (cc == '-') { 748 bufferCharacter(cc); 749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState); 750 } else if (cc == '<') { 751 bufferCharacter(cc); 752 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 753 } else if (cc == kEndOfFileMarker) { 754 parseError(); 755 HTML_RECONSUME_IN(DataState); 756 } else { 757 bufferCharacter(cc); 758 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 759 } 760 } 761 END_STATE() 762 763 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 764 if (cc == '-') { 765 bufferCharacter(cc); 766 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 767 } else if (cc == '<') { 768 bufferCharacter(cc); 769 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 770 } else if (cc == kEndOfFileMarker) { 771 parseError(); 772 HTML_RECONSUME_IN(DataState); 773 } else { 774 bufferCharacter(cc); 775 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 776 } 777 } 778 END_STATE() 779 780 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 781 if (cc == '-') { 782 bufferCharacter(cc); 783 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 784 } else if (cc == '<') { 785 bufferCharacter(cc); 786 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 787 } else if (cc == '>') { 788 bufferCharacter(cc); 789 HTML_ADVANCE_TO(ScriptDataState); 790 } else if (cc == kEndOfFileMarker) { 791 parseError(); 792 HTML_RECONSUME_IN(DataState); 793 } else { 794 bufferCharacter(cc); 795 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 796 } 797 } 798 END_STATE() 799 800 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 801 if (cc == '/') { 802 bufferCharacter(cc); 803 m_temporaryBuffer.clear(); 804 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 805 } else 806 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 807 } 808 END_STATE() 809 810 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 811 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 812 bufferCharacter(cc); 813 if (temporaryBufferIs(scriptTag.localName())) 814 HTML_ADVANCE_TO(ScriptDataEscapedState); 815 else 816 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 817 } else if (isASCIIUpper(cc)) { 818 bufferCharacter(cc); 819 m_temporaryBuffer.append(toLowerCase(cc)); 820 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 821 } else if (isASCIILower(cc)) { 822 bufferCharacter(cc); 823 m_temporaryBuffer.append(static_cast<LChar>(cc)); 824 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 825 } else 826 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 827 } 828 END_STATE() 829 830 HTML_BEGIN_STATE(BeforeAttributeNameState) { 831 if (isTokenizerWhitespace(cc)) 832 HTML_ADVANCE_TO(BeforeAttributeNameState); 833 else if (cc == '/') 834 HTML_ADVANCE_TO(SelfClosingStartTagState); 835 else if (cc == '>') 836 return emitAndResumeIn(source, HTMLTokenizer::DataState); 837 else if (isASCIIUpper(cc)) { 838 m_token->addNewAttribute(); 839 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 840 m_token->appendToAttributeName(toLowerCase(cc)); 841 HTML_ADVANCE_TO(AttributeNameState); 842 } else if (cc == kEndOfFileMarker) { 843 parseError(); 844 HTML_RECONSUME_IN(DataState); 845 } else { 846 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 847 parseError(); 848 m_token->addNewAttribute(); 849 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 850 m_token->appendToAttributeName(cc); 851 HTML_ADVANCE_TO(AttributeNameState); 852 } 853 } 854 END_STATE() 855 856 HTML_BEGIN_STATE(AttributeNameState) { 857 if (isTokenizerWhitespace(cc)) { 858 m_token->endAttributeName(source.numberOfCharactersConsumed()); 859 HTML_ADVANCE_TO(AfterAttributeNameState); 860 } else if (cc == '/') { 861 m_token->endAttributeName(source.numberOfCharactersConsumed()); 862 HTML_ADVANCE_TO(SelfClosingStartTagState); 863 } else if (cc == '=') { 864 m_token->endAttributeName(source.numberOfCharactersConsumed()); 865 HTML_ADVANCE_TO(BeforeAttributeValueState); 866 } else if (cc == '>') { 867 m_token->endAttributeName(source.numberOfCharactersConsumed()); 868 return emitAndResumeIn(source, HTMLTokenizer::DataState); 869 } else if (isASCIIUpper(cc)) { 870 m_token->appendToAttributeName(toLowerCase(cc)); 871 HTML_ADVANCE_TO(AttributeNameState); 872 } else if (cc == kEndOfFileMarker) { 873 parseError(); 874 m_token->endAttributeName(source.numberOfCharactersConsumed()); 875 HTML_RECONSUME_IN(DataState); 876 } else { 877 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 878 parseError(); 879 m_token->appendToAttributeName(cc); 880 HTML_ADVANCE_TO(AttributeNameState); 881 } 882 } 883 END_STATE() 884 885 HTML_BEGIN_STATE(AfterAttributeNameState) { 886 if (isTokenizerWhitespace(cc)) 887 HTML_ADVANCE_TO(AfterAttributeNameState); 888 else if (cc == '/') 889 HTML_ADVANCE_TO(SelfClosingStartTagState); 890 else if (cc == '=') 891 HTML_ADVANCE_TO(BeforeAttributeValueState); 892 else if (cc == '>') 893 return emitAndResumeIn(source, HTMLTokenizer::DataState); 894 else if (isASCIIUpper(cc)) { 895 m_token->addNewAttribute(); 896 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 897 m_token->appendToAttributeName(toLowerCase(cc)); 898 HTML_ADVANCE_TO(AttributeNameState); 899 } else if (cc == kEndOfFileMarker) { 900 parseError(); 901 HTML_RECONSUME_IN(DataState); 902 } else { 903 if (cc == '"' || cc == '\'' || cc == '<') 904 parseError(); 905 m_token->addNewAttribute(); 906 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 907 m_token->appendToAttributeName(cc); 908 HTML_ADVANCE_TO(AttributeNameState); 909 } 910 } 911 END_STATE() 912 913 HTML_BEGIN_STATE(BeforeAttributeValueState) { 914 if (isTokenizerWhitespace(cc)) 915 HTML_ADVANCE_TO(BeforeAttributeValueState); 916 else if (cc == '"') { 917 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 918 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 919 } else if (cc == '&') { 920 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 921 HTML_RECONSUME_IN(AttributeValueUnquotedState); 922 } else if (cc == '\'') { 923 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 924 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 925 } else if (cc == '>') { 926 parseError(); 927 return emitAndResumeIn(source, HTMLTokenizer::DataState); 928 } else if (cc == kEndOfFileMarker) { 929 parseError(); 930 HTML_RECONSUME_IN(DataState); 931 } else { 932 if (cc == '<' || cc == '=' || cc == '`') 933 parseError(); 934 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 935 m_token->appendToAttributeValue(cc); 936 HTML_ADVANCE_TO(AttributeValueUnquotedState); 937 } 938 } 939 END_STATE() 940 941 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 942 if (cc == '"') { 943 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 944 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 945 } else if (cc == '&') { 946 m_additionalAllowedCharacter = '"'; 947 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 948 } else if (cc == kEndOfFileMarker) { 949 parseError(); 950 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 951 HTML_RECONSUME_IN(DataState); 952 } else { 953 m_token->appendToAttributeValue(cc); 954 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 955 } 956 } 957 END_STATE() 958 959 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 960 if (cc == '\'') { 961 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 962 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 963 } else if (cc == '&') { 964 m_additionalAllowedCharacter = '\''; 965 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 966 } else if (cc == kEndOfFileMarker) { 967 parseError(); 968 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 969 HTML_RECONSUME_IN(DataState); 970 } else { 971 m_token->appendToAttributeValue(cc); 972 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 973 } 974 } 975 END_STATE() 976 977 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 978 if (isTokenizerWhitespace(cc)) { 979 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 980 HTML_ADVANCE_TO(BeforeAttributeNameState); 981 } else if (cc == '&') { 982 m_additionalAllowedCharacter = '>'; 983 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 984 } else if (cc == '>') { 985 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 986 return emitAndResumeIn(source, HTMLTokenizer::DataState); 987 } else if (cc == kEndOfFileMarker) { 988 parseError(); 989 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 990 HTML_RECONSUME_IN(DataState); 991 } else { 992 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 993 parseError(); 994 m_token->appendToAttributeValue(cc); 995 HTML_ADVANCE_TO(AttributeValueUnquotedState); 996 } 997 } 998 END_STATE() 999 1000 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { 1001 bool notEnoughCharacters = false; 1002 DecodedHTMLEntity decodedEntity; 1003 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 1004 if (notEnoughCharacters) 1005 return haveBufferedCharacterToken(); 1006 if (!success) { 1007 ASSERT(decodedEntity.isEmpty()); 1008 m_token->appendToAttributeValue('&'); 1009 } else { 1010 for (unsigned i = 0; i < decodedEntity.length; ++i) 1011 m_token->appendToAttributeValue(decodedEntity.data[i]); 1012 } 1013 // We're supposed to switch back to the attribute value state that 1014 // we were in when we were switched into this state. Rather than 1015 // keeping track of this explictly, we observe that the previous 1016 // state can be determined by m_additionalAllowedCharacter. 1017 if (m_additionalAllowedCharacter == '"') 1018 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); 1019 else if (m_additionalAllowedCharacter == '\'') 1020 HTML_SWITCH_TO(AttributeValueSingleQuotedState); 1021 else if (m_additionalAllowedCharacter == '>') 1022 HTML_SWITCH_TO(AttributeValueUnquotedState); 1023 else 1024 ASSERT_NOT_REACHED(); 1025 } 1026 END_STATE() 1027 1028 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 1029 if (isTokenizerWhitespace(cc)) 1030 HTML_ADVANCE_TO(BeforeAttributeNameState); 1031 else if (cc == '/') 1032 HTML_ADVANCE_TO(SelfClosingStartTagState); 1033 else if (cc == '>') 1034 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1035 else if (cc == kEndOfFileMarker) { 1036 parseError(); 1037 HTML_RECONSUME_IN(DataState); 1038 } else { 1039 parseError(); 1040 HTML_RECONSUME_IN(BeforeAttributeNameState); 1041 } 1042 } 1043 END_STATE() 1044 1045 HTML_BEGIN_STATE(SelfClosingStartTagState) { 1046 if (cc == '>') { 1047 m_token->setSelfClosing(); 1048 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1049 } else if (cc == kEndOfFileMarker) { 1050 parseError(); 1051 HTML_RECONSUME_IN(DataState); 1052 } else { 1053 parseError(); 1054 HTML_RECONSUME_IN(BeforeAttributeNameState); 1055 } 1056 } 1057 END_STATE() 1058 1059 HTML_BEGIN_STATE(BogusCommentState) { 1060 m_token->beginComment(); 1061 HTML_RECONSUME_IN(ContinueBogusCommentState); 1062 } 1063 END_STATE() 1064 1065 HTML_BEGIN_STATE(ContinueBogusCommentState) { 1066 if (cc == '>') 1067 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1068 else if (cc == kEndOfFileMarker) 1069 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1070 else { 1071 m_token->appendToComment(cc); 1072 HTML_ADVANCE_TO(ContinueBogusCommentState); 1073 } 1074 } 1075 END_STATE() 1076 1077 HTML_BEGIN_STATE(MarkupDeclarationOpenState) { 1078 if (cc == '-') { 1079 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::dashDash); 1080 if (result == SegmentedString::DidMatch) { 1081 source.advanceAndASSERT('-'); 1082 source.advanceAndASSERT('-'); 1083 m_token->beginComment(); 1084 HTML_SWITCH_TO(CommentStartState); 1085 } else if (result == SegmentedString::NotEnoughCharacters) 1086 return haveBufferedCharacterToken(); 1087 } else if (cc == 'D' || cc == 'd') { 1088 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::doctype); 1089 if (result == SegmentedString::DidMatch) { 1090 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1091 HTML_SWITCH_TO(DOCTYPEState); 1092 } else if (result == SegmentedString::NotEnoughCharacters) 1093 return haveBufferedCharacterToken(); 1094 } else if (cc == '[' && shouldAllowCDATA()) { 1095 SegmentedString::LookAheadResult result = source.lookAhead(HTMLTokenizerNames::cdata); 1096 if (result == SegmentedString::DidMatch) { 1097 advanceStringAndASSERT(source, "[CDATA["); 1098 HTML_SWITCH_TO(CDATASectionState); 1099 } else if (result == SegmentedString::NotEnoughCharacters) 1100 return haveBufferedCharacterToken(); 1101 } 1102 parseError(); 1103 HTML_RECONSUME_IN(BogusCommentState); 1104 } 1105 END_STATE() 1106 1107 HTML_BEGIN_STATE(CommentStartState) { 1108 if (cc == '-') 1109 HTML_ADVANCE_TO(CommentStartDashState); 1110 else if (cc == '>') { 1111 parseError(); 1112 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1113 } else if (cc == kEndOfFileMarker) { 1114 parseError(); 1115 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1116 } else { 1117 m_token->appendToComment(cc); 1118 HTML_ADVANCE_TO(CommentState); 1119 } 1120 } 1121 END_STATE() 1122 1123 HTML_BEGIN_STATE(CommentStartDashState) { 1124 if (cc == '-') 1125 HTML_ADVANCE_TO(CommentEndState); 1126 else if (cc == '>') { 1127 parseError(); 1128 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1129 } else if (cc == kEndOfFileMarker) { 1130 parseError(); 1131 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1132 } else { 1133 m_token->appendToComment('-'); 1134 m_token->appendToComment(cc); 1135 HTML_ADVANCE_TO(CommentState); 1136 } 1137 } 1138 END_STATE() 1139 1140 HTML_BEGIN_STATE(CommentState) { 1141 if (cc == '-') 1142 HTML_ADVANCE_TO(CommentEndDashState); 1143 else if (cc == kEndOfFileMarker) { 1144 parseError(); 1145 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1146 } else { 1147 m_token->appendToComment(cc); 1148 HTML_ADVANCE_TO(CommentState); 1149 } 1150 } 1151 END_STATE() 1152 1153 HTML_BEGIN_STATE(CommentEndDashState) { 1154 if (cc == '-') 1155 HTML_ADVANCE_TO(CommentEndState); 1156 else if (cc == kEndOfFileMarker) { 1157 parseError(); 1158 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1159 } else { 1160 m_token->appendToComment('-'); 1161 m_token->appendToComment(cc); 1162 HTML_ADVANCE_TO(CommentState); 1163 } 1164 } 1165 END_STATE() 1166 1167 HTML_BEGIN_STATE(CommentEndState) { 1168 if (cc == '>') 1169 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1170 else if (cc == '!') { 1171 parseError(); 1172 HTML_ADVANCE_TO(CommentEndBangState); 1173 } else if (cc == '-') { 1174 parseError(); 1175 m_token->appendToComment('-'); 1176 HTML_ADVANCE_TO(CommentEndState); 1177 } else if (cc == kEndOfFileMarker) { 1178 parseError(); 1179 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1180 } else { 1181 parseError(); 1182 m_token->appendToComment('-'); 1183 m_token->appendToComment('-'); 1184 m_token->appendToComment(cc); 1185 HTML_ADVANCE_TO(CommentState); 1186 } 1187 } 1188 END_STATE() 1189 1190 HTML_BEGIN_STATE(CommentEndBangState) { 1191 if (cc == '-') { 1192 m_token->appendToComment('-'); 1193 m_token->appendToComment('-'); 1194 m_token->appendToComment('!'); 1195 HTML_ADVANCE_TO(CommentEndDashState); 1196 } else if (cc == '>') 1197 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1198 else if (cc == kEndOfFileMarker) { 1199 parseError(); 1200 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1201 } else { 1202 m_token->appendToComment('-'); 1203 m_token->appendToComment('-'); 1204 m_token->appendToComment('!'); 1205 m_token->appendToComment(cc); 1206 HTML_ADVANCE_TO(CommentState); 1207 } 1208 } 1209 END_STATE() 1210 1211 HTML_BEGIN_STATE(DOCTYPEState) { 1212 if (isTokenizerWhitespace(cc)) 1213 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1214 else if (cc == kEndOfFileMarker) { 1215 parseError(); 1216 m_token->beginDOCTYPE(); 1217 m_token->setForceQuirks(); 1218 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1219 } else { 1220 parseError(); 1221 HTML_RECONSUME_IN(BeforeDOCTYPENameState); 1222 } 1223 } 1224 END_STATE() 1225 1226 HTML_BEGIN_STATE(BeforeDOCTYPENameState) { 1227 if (isTokenizerWhitespace(cc)) 1228 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1229 else if (isASCIIUpper(cc)) { 1230 m_token->beginDOCTYPE(toLowerCase(cc)); 1231 HTML_ADVANCE_TO(DOCTYPENameState); 1232 } else if (cc == '>') { 1233 parseError(); 1234 m_token->beginDOCTYPE(); 1235 m_token->setForceQuirks(); 1236 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1237 } else if (cc == kEndOfFileMarker) { 1238 parseError(); 1239 m_token->beginDOCTYPE(); 1240 m_token->setForceQuirks(); 1241 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1242 } else { 1243 m_token->beginDOCTYPE(cc); 1244 HTML_ADVANCE_TO(DOCTYPENameState); 1245 } 1246 } 1247 END_STATE() 1248 1249 HTML_BEGIN_STATE(DOCTYPENameState) { 1250 if (isTokenizerWhitespace(cc)) 1251 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1252 else if (cc == '>') 1253 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1254 else if (isASCIIUpper(cc)) { 1255 m_token->appendToName(toLowerCase(cc)); 1256 HTML_ADVANCE_TO(DOCTYPENameState); 1257 } else if (cc == kEndOfFileMarker) { 1258 parseError(); 1259 m_token->setForceQuirks(); 1260 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1261 } else { 1262 m_token->appendToName(cc); 1263 HTML_ADVANCE_TO(DOCTYPENameState); 1264 } 1265 } 1266 END_STATE() 1267 1268 HTML_BEGIN_STATE(AfterDOCTYPENameState) { 1269 if (isTokenizerWhitespace(cc)) 1270 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1271 if (cc == '>') 1272 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1273 else if (cc == kEndOfFileMarker) { 1274 parseError(); 1275 m_token->setForceQuirks(); 1276 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1277 } else { 1278 if (cc == 'P' || cc == 'p') { 1279 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::publicString); 1280 if (result == SegmentedString::DidMatch) { 1281 advanceStringAndASSERTIgnoringCase(source, "public"); 1282 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1283 } else if (result == SegmentedString::NotEnoughCharacters) 1284 return haveBufferedCharacterToken(); 1285 } else if (cc == 'S' || cc == 's') { 1286 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(HTMLTokenizerNames::system); 1287 if (result == SegmentedString::DidMatch) { 1288 advanceStringAndASSERTIgnoringCase(source, "system"); 1289 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState); 1290 } else if (result == SegmentedString::NotEnoughCharacters) 1291 return haveBufferedCharacterToken(); 1292 } 1293 parseError(); 1294 m_token->setForceQuirks(); 1295 HTML_ADVANCE_TO(BogusDOCTYPEState); 1296 } 1297 } 1298 END_STATE() 1299 1300 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1301 if (isTokenizerWhitespace(cc)) 1302 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1303 else if (cc == '"') { 1304 parseError(); 1305 m_token->setPublicIdentifierToEmptyString(); 1306 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1307 } else if (cc == '\'') { 1308 parseError(); 1309 m_token->setPublicIdentifierToEmptyString(); 1310 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1311 } else if (cc == '>') { 1312 parseError(); 1313 m_token->setForceQuirks(); 1314 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1315 } else if (cc == kEndOfFileMarker) { 1316 parseError(); 1317 m_token->setForceQuirks(); 1318 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1319 } else { 1320 parseError(); 1321 m_token->setForceQuirks(); 1322 HTML_ADVANCE_TO(BogusDOCTYPEState); 1323 } 1324 } 1325 END_STATE() 1326 1327 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1328 if (isTokenizerWhitespace(cc)) 1329 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1330 else if (cc == '"') { 1331 m_token->setPublicIdentifierToEmptyString(); 1332 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1333 } else if (cc == '\'') { 1334 m_token->setPublicIdentifierToEmptyString(); 1335 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1336 } else if (cc == '>') { 1337 parseError(); 1338 m_token->setForceQuirks(); 1339 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1340 } else if (cc == kEndOfFileMarker) { 1341 parseError(); 1342 m_token->setForceQuirks(); 1343 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1344 } else { 1345 parseError(); 1346 m_token->setForceQuirks(); 1347 HTML_ADVANCE_TO(BogusDOCTYPEState); 1348 } 1349 } 1350 END_STATE() 1351 1352 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1353 if (cc == '"') 1354 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1355 else if (cc == '>') { 1356 parseError(); 1357 m_token->setForceQuirks(); 1358 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1359 } else if (cc == kEndOfFileMarker) { 1360 parseError(); 1361 m_token->setForceQuirks(); 1362 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1363 } else { 1364 m_token->appendToPublicIdentifier(cc); 1365 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1366 } 1367 } 1368 END_STATE() 1369 1370 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1371 if (cc == '\'') 1372 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1373 else if (cc == '>') { 1374 parseError(); 1375 m_token->setForceQuirks(); 1376 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1377 } else if (cc == kEndOfFileMarker) { 1378 parseError(); 1379 m_token->setForceQuirks(); 1380 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1381 } else { 1382 m_token->appendToPublicIdentifier(cc); 1383 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1384 } 1385 } 1386 END_STATE() 1387 1388 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1389 if (isTokenizerWhitespace(cc)) 1390 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1391 else if (cc == '>') 1392 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1393 else if (cc == '"') { 1394 parseError(); 1395 m_token->setSystemIdentifierToEmptyString(); 1396 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1397 } else if (cc == '\'') { 1398 parseError(); 1399 m_token->setSystemIdentifierToEmptyString(); 1400 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1401 } else if (cc == kEndOfFileMarker) { 1402 parseError(); 1403 m_token->setForceQuirks(); 1404 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1405 } else { 1406 parseError(); 1407 m_token->setForceQuirks(); 1408 HTML_ADVANCE_TO(BogusDOCTYPEState); 1409 } 1410 } 1411 END_STATE() 1412 1413 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1414 if (isTokenizerWhitespace(cc)) 1415 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1416 else if (cc == '>') 1417 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1418 else if (cc == '"') { 1419 m_token->setSystemIdentifierToEmptyString(); 1420 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1421 } else if (cc == '\'') { 1422 m_token->setSystemIdentifierToEmptyString(); 1423 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1424 } else if (cc == kEndOfFileMarker) { 1425 parseError(); 1426 m_token->setForceQuirks(); 1427 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1428 } else { 1429 parseError(); 1430 m_token->setForceQuirks(); 1431 HTML_ADVANCE_TO(BogusDOCTYPEState); 1432 } 1433 } 1434 END_STATE() 1435 1436 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1437 if (isTokenizerWhitespace(cc)) 1438 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1439 else if (cc == '"') { 1440 parseError(); 1441 m_token->setSystemIdentifierToEmptyString(); 1442 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1443 } else if (cc == '\'') { 1444 parseError(); 1445 m_token->setSystemIdentifierToEmptyString(); 1446 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1447 } else if (cc == '>') { 1448 parseError(); 1449 m_token->setForceQuirks(); 1450 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1451 } else if (cc == kEndOfFileMarker) { 1452 parseError(); 1453 m_token->setForceQuirks(); 1454 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1455 } else { 1456 parseError(); 1457 m_token->setForceQuirks(); 1458 HTML_ADVANCE_TO(BogusDOCTYPEState); 1459 } 1460 } 1461 END_STATE() 1462 1463 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1464 if (isTokenizerWhitespace(cc)) 1465 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1466 if (cc == '"') { 1467 m_token->setSystemIdentifierToEmptyString(); 1468 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1469 } else if (cc == '\'') { 1470 m_token->setSystemIdentifierToEmptyString(); 1471 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1472 } else if (cc == '>') { 1473 parseError(); 1474 m_token->setForceQuirks(); 1475 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1476 } else if (cc == kEndOfFileMarker) { 1477 parseError(); 1478 m_token->setForceQuirks(); 1479 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1480 } else { 1481 parseError(); 1482 m_token->setForceQuirks(); 1483 HTML_ADVANCE_TO(BogusDOCTYPEState); 1484 } 1485 } 1486 END_STATE() 1487 1488 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1489 if (cc == '"') 1490 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1491 else if (cc == '>') { 1492 parseError(); 1493 m_token->setForceQuirks(); 1494 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1495 } else if (cc == kEndOfFileMarker) { 1496 parseError(); 1497 m_token->setForceQuirks(); 1498 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1499 } else { 1500 m_token->appendToSystemIdentifier(cc); 1501 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1502 } 1503 } 1504 END_STATE() 1505 1506 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1507 if (cc == '\'') 1508 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1509 else if (cc == '>') { 1510 parseError(); 1511 m_token->setForceQuirks(); 1512 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1513 } else if (cc == kEndOfFileMarker) { 1514 parseError(); 1515 m_token->setForceQuirks(); 1516 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1517 } else { 1518 m_token->appendToSystemIdentifier(cc); 1519 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1520 } 1521 } 1522 END_STATE() 1523 1524 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1525 if (isTokenizerWhitespace(cc)) 1526 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1527 else if (cc == '>') 1528 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1529 else if (cc == kEndOfFileMarker) { 1530 parseError(); 1531 m_token->setForceQuirks(); 1532 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1533 } else { 1534 parseError(); 1535 HTML_ADVANCE_TO(BogusDOCTYPEState); 1536 } 1537 } 1538 END_STATE() 1539 1540 HTML_BEGIN_STATE(BogusDOCTYPEState) { 1541 if (cc == '>') 1542 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1543 else if (cc == kEndOfFileMarker) 1544 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1545 HTML_ADVANCE_TO(BogusDOCTYPEState); 1546 } 1547 END_STATE() 1548 1549 HTML_BEGIN_STATE(CDATASectionState) { 1550 if (cc == ']') 1551 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState); 1552 else if (cc == kEndOfFileMarker) 1553 HTML_RECONSUME_IN(DataState); 1554 else { 1555 bufferCharacter(cc); 1556 HTML_ADVANCE_TO(CDATASectionState); 1557 } 1558 } 1559 END_STATE() 1560 1561 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) { 1562 if (cc == ']') 1563 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1564 else { 1565 bufferCharacter(']'); 1566 HTML_RECONSUME_IN(CDATASectionState); 1567 } 1568 } 1569 1570 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1571 if (cc == '>') 1572 HTML_ADVANCE_TO(DataState); 1573 else { 1574 bufferCharacter(']'); 1575 bufferCharacter(']'); 1576 HTML_RECONSUME_IN(CDATASectionState); 1577 } 1578 } 1579 END_STATE() 1580 1581 } 1582 1583 ASSERT_NOT_REACHED(); 1584 return false; 1585 } 1586 1587 String HTMLTokenizer::bufferedCharacters() const 1588 { 1589 // FIXME: Add an assert about m_state. 1590 StringBuilder characters; 1591 characters.reserveCapacity(numberOfBufferedCharacters()); 1592 characters.append('<'); 1593 characters.append('/'); 1594 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size()); 1595 return characters.toString(); 1596 } 1597 1598 void HTMLTokenizer::updateStateFor(const String& tagName) 1599 { 1600 if (threadSafeMatch(tagName, textareaTag) || threadSafeMatch(tagName, titleTag)) 1601 setState(HTMLTokenizer::RCDATAState); 1602 else if (threadSafeMatch(tagName, plaintextTag)) 1603 setState(HTMLTokenizer::PLAINTEXTState); 1604 else if (threadSafeMatch(tagName, scriptTag)) 1605 setState(HTMLTokenizer::ScriptDataState); 1606 else if (threadSafeMatch(tagName, styleTag) 1607 || threadSafeMatch(tagName, iframeTag) 1608 || threadSafeMatch(tagName, xmpTag) 1609 || (threadSafeMatch(tagName, noembedTag) && m_options.pluginsEnabled) 1610 || threadSafeMatch(tagName, noframesTag) 1611 || (threadSafeMatch(tagName, noscriptTag) && m_options.scriptEnabled)) 1612 setState(HTMLTokenizer::RAWTEXTState); 1613 } 1614 1615 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1616 { 1617 return vectorEqualsString(m_temporaryBuffer, expectedString); 1618 } 1619 1620 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc) 1621 { 1622 ASSERT(isEndTagBufferingState(m_state)); 1623 m_bufferedEndTagName.append(cc); 1624 } 1625 1626 inline bool HTMLTokenizer::isAppropriateEndTag() 1627 { 1628 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size()) 1629 return false; 1630 1631 size_t numCharacters = m_bufferedEndTagName.size(); 1632 1633 for (size_t i = 0; i < numCharacters; i++) { 1634 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i]) 1635 return false; 1636 } 1637 1638 return true; 1639 } 1640 1641 inline void HTMLTokenizer::parseError() 1642 { 1643 notImplemented(); 1644 } 1645 1646 } 1647