1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/html/parser/HTMLTokenizer.h" 30 31 #include "HTMLNames.h" 32 #include "core/html/parser/HTMLEntityParser.h" 33 #include "core/html/parser/HTMLTreeBuilder.h" 34 #include "platform/NotImplemented.h" 35 #include "core/xml/parser/MarkupTokenizerInlines.h" 36 #include "wtf/ASCIICType.h" 37 #include "wtf/text/AtomicString.h" 38 #include "wtf/unicode/Unicode.h" 39 40 using namespace WTF; 41 42 namespace WebCore { 43 44 using namespace HTMLNames; 45 46 // This has to go in a .cpp file, as the linker doesn't like it being included more than once. 47 // We don't have an HTMLToken.cpp though, so this is the next best place. 48 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const 49 { 50 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom); 51 } 52 53 bool AtomicHTMLToken::usesName() const 54 { 55 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 56 } 57 58 bool AtomicHTMLToken::usesAttributes() const 59 { 60 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 61 } 62 63 static inline UChar toLowerCase(UChar cc) 64 { 65 ASSERT(isASCIIUpper(cc)); 66 const int lowerCaseOffset = 0x20; 67 return cc + lowerCaseOffset; 68 } 69 70 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string) 71 { 72 if (vector.size() != string.length()) 73 return false; 74 75 if (!string.length()) 76 return true; 77 78 return equal(string.impl(), vector.data(), vector.size()); 79 } 80 81 static inline bool isEndTagBufferingState(HTMLTokenizer::State state) 82 { 83 switch (state) { 84 case HTMLTokenizer::RCDATAEndTagOpenState: 85 case HTMLTokenizer::RCDATAEndTagNameState: 86 case HTMLTokenizer::RAWTEXTEndTagOpenState: 87 case HTMLTokenizer::RAWTEXTEndTagNameState: 88 case HTMLTokenizer::ScriptDataEndTagOpenState: 89 case HTMLTokenizer::ScriptDataEndTagNameState: 90 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 91 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 92 return true; 93 default: 94 return false; 95 } 96 } 97 98 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) 99 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) 100 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) 101 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) 102 103 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options) 104 : m_inputStreamPreprocessor(this) 105 , m_options(options) 106 { 107 reset(); 108 } 109 110 HTMLTokenizer::~HTMLTokenizer() 111 { 112 } 113 114 void HTMLTokenizer::reset() 115 { 116 m_state = HTMLTokenizer::DataState; 117 m_token = 0; 118 m_forceNullCharacterReplacement = false; 119 m_shouldAllowCDATA = false; 120 m_additionalAllowedCharacter = '\0'; 121 } 122 123 bool HTMLTokenizer::canCreateCheckpoint() const 124 { 125 if (!m_appropriateEndTagName.isEmpty()) 126 return false; 127 if (!m_temporaryBuffer.isEmpty()) 128 return false; 129 if (!m_bufferedEndTagName.isEmpty()) 130 return false; 131 return true; 132 } 133 134 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const 135 { 136 ASSERT(canCreateCheckpoint()); 137 result.options = m_options; 138 result.state = m_state; 139 result.additionalAllowedCharacter = m_additionalAllowedCharacter; 140 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine(); 141 result.shouldAllowCDATA = m_shouldAllowCDATA; 142 } 143 144 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint) 145 { 146 m_token = 0; 147 m_options = checkpoint.options; 148 m_state = checkpoint.state; 149 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter; 150 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine); 151 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA; 152 } 153 154 inline bool HTMLTokenizer::processEntity(SegmentedString& source) 155 { 156 bool notEnoughCharacters = false; 157 DecodedHTMLEntity decodedEntity; 158 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 159 if (notEnoughCharacters) 160 return false; 161 if (!success) { 162 ASSERT(decodedEntity.isEmpty()); 163 bufferCharacter('&'); 164 } else { 165 for (unsigned i = 0; i < decodedEntity.length; ++i) 166 bufferCharacter(decodedEntity.data[i]); 167 } 168 return true; 169 } 170 171 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 172 { 173 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 174 source.advanceAndUpdateLineNumber(); 175 if (m_token->type() == HTMLToken::Character) 176 return true; 177 m_token->beginEndTag(m_bufferedEndTagName); 178 m_bufferedEndTagName.clear(); 179 m_appropriateEndTagName.clear(); 180 m_temporaryBuffer.clear(); 181 return false; 182 } 183 184 #define FLUSH_AND_ADVANCE_TO(stateName) \ 185 do { \ 186 m_state = HTMLTokenizer::stateName; \ 187 if (flushBufferedEndTag(source)) \ 188 return true; \ 189 if (source.isEmpty() \ 190 || !m_inputStreamPreprocessor.peek(source)) \ 191 return haveBufferedCharacterToken(); \ 192 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 193 goto stateName; \ 194 } while (false) 195 196 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 197 { 198 m_state = state; 199 flushBufferedEndTag(source); 200 return true; 201 } 202 203 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 204 { 205 // If we have a token in progress, then we're supposed to be called back 206 // with the same token so we can finish it. 207 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 208 m_token = &token; 209 210 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 211 // FIXME: This should call flushBufferedEndTag(). 212 // We started an end tag during our last iteration. 213 m_token->beginEndTag(m_bufferedEndTagName); 214 m_bufferedEndTagName.clear(); 215 m_appropriateEndTagName.clear(); 216 m_temporaryBuffer.clear(); 217 if (m_state == HTMLTokenizer::DataState) { 218 // We're back in the data state, so we must be done with the tag. 219 return true; 220 } 221 } 222 223 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 224 return haveBufferedCharacterToken(); 225 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 226 227 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 228 switch (m_state) { 229 HTML_BEGIN_STATE(DataState) { 230 if (cc == '&') 231 HTML_ADVANCE_TO(CharacterReferenceInDataState); 232 else if (cc == '<') { 233 if (m_token->type() == HTMLToken::Character) { 234 // We have a bunch of character tokens queued up that we 235 // are emitting lazily here. 236 return true; 237 } 238 HTML_ADVANCE_TO(TagOpenState); 239 } else if (cc == kEndOfFileMarker) 240 return emitEndOfFile(source); 241 else { 242 bufferCharacter(cc); 243 HTML_ADVANCE_TO(DataState); 244 } 245 } 246 END_STATE() 247 248 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 249 if (!processEntity(source)) 250 return haveBufferedCharacterToken(); 251 HTML_SWITCH_TO(DataState); 252 } 253 END_STATE() 254 255 HTML_BEGIN_STATE(RCDATAState) { 256 if (cc == '&') 257 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState); 258 else if (cc == '<') 259 HTML_ADVANCE_TO(RCDATALessThanSignState); 260 else if (cc == kEndOfFileMarker) 261 return emitEndOfFile(source); 262 else { 263 bufferCharacter(cc); 264 HTML_ADVANCE_TO(RCDATAState); 265 } 266 } 267 END_STATE() 268 269 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) { 270 if (!processEntity(source)) 271 return haveBufferedCharacterToken(); 272 HTML_SWITCH_TO(RCDATAState); 273 } 274 END_STATE() 275 276 HTML_BEGIN_STATE(RAWTEXTState) { 277 if (cc == '<') 278 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 279 else if (cc == kEndOfFileMarker) 280 return emitEndOfFile(source); 281 else { 282 bufferCharacter(cc); 283 HTML_ADVANCE_TO(RAWTEXTState); 284 } 285 } 286 END_STATE() 287 288 HTML_BEGIN_STATE(ScriptDataState) { 289 if (cc == '<') 290 HTML_ADVANCE_TO(ScriptDataLessThanSignState); 291 else if (cc == kEndOfFileMarker) 292 return emitEndOfFile(source); 293 else { 294 bufferCharacter(cc); 295 HTML_ADVANCE_TO(ScriptDataState); 296 } 297 } 298 END_STATE() 299 300 HTML_BEGIN_STATE(PLAINTEXTState) { 301 if (cc == kEndOfFileMarker) 302 return emitEndOfFile(source); 303 bufferCharacter(cc); 304 HTML_ADVANCE_TO(PLAINTEXTState); 305 } 306 END_STATE() 307 308 HTML_BEGIN_STATE(TagOpenState) { 309 if (cc == '!') 310 HTML_ADVANCE_TO(MarkupDeclarationOpenState); 311 else if (cc == '/') 312 HTML_ADVANCE_TO(EndTagOpenState); 313 else if (isASCIIUpper(cc)) { 314 m_token->beginStartTag(toLowerCase(cc)); 315 HTML_ADVANCE_TO(TagNameState); 316 } else if (isASCIILower(cc)) { 317 m_token->beginStartTag(cc); 318 HTML_ADVANCE_TO(TagNameState); 319 } else if (cc == '?') { 320 parseError(); 321 // The spec consumes the current character before switching 322 // to the bogus comment state, but it's easier to implement 323 // if we reconsume the current character. 324 HTML_RECONSUME_IN(BogusCommentState); 325 } else { 326 parseError(); 327 bufferCharacter('<'); 328 HTML_RECONSUME_IN(DataState); 329 } 330 } 331 END_STATE() 332 333 HTML_BEGIN_STATE(EndTagOpenState) { 334 if (isASCIIUpper(cc)) { 335 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); 336 m_appropriateEndTagName.clear(); 337 HTML_ADVANCE_TO(TagNameState); 338 } else if (isASCIILower(cc)) { 339 m_token->beginEndTag(static_cast<LChar>(cc)); 340 m_appropriateEndTagName.clear(); 341 HTML_ADVANCE_TO(TagNameState); 342 } else if (cc == '>') { 343 parseError(); 344 HTML_ADVANCE_TO(DataState); 345 } else if (cc == kEndOfFileMarker) { 346 parseError(); 347 bufferCharacter('<'); 348 bufferCharacter('/'); 349 HTML_RECONSUME_IN(DataState); 350 } else { 351 parseError(); 352 HTML_RECONSUME_IN(BogusCommentState); 353 } 354 } 355 END_STATE() 356 357 HTML_BEGIN_STATE(TagNameState) { 358 if (isTokenizerWhitespace(cc)) 359 HTML_ADVANCE_TO(BeforeAttributeNameState); 360 else if (cc == '/') 361 HTML_ADVANCE_TO(SelfClosingStartTagState); 362 else if (cc == '>') 363 return emitAndResumeIn(source, HTMLTokenizer::DataState); 364 else if (isASCIIUpper(cc)) { 365 m_token->appendToName(toLowerCase(cc)); 366 HTML_ADVANCE_TO(TagNameState); 367 } else if (cc == kEndOfFileMarker) { 368 parseError(); 369 HTML_RECONSUME_IN(DataState); 370 } else { 371 m_token->appendToName(cc); 372 HTML_ADVANCE_TO(TagNameState); 373 } 374 } 375 END_STATE() 376 377 HTML_BEGIN_STATE(RCDATALessThanSignState) { 378 if (cc == '/') { 379 m_temporaryBuffer.clear(); 380 ASSERT(m_bufferedEndTagName.isEmpty()); 381 HTML_ADVANCE_TO(RCDATAEndTagOpenState); 382 } else { 383 bufferCharacter('<'); 384 HTML_RECONSUME_IN(RCDATAState); 385 } 386 } 387 END_STATE() 388 389 HTML_BEGIN_STATE(RCDATAEndTagOpenState) { 390 if (isASCIIUpper(cc)) { 391 m_temporaryBuffer.append(static_cast<LChar>(cc)); 392 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 393 HTML_ADVANCE_TO(RCDATAEndTagNameState); 394 } else if (isASCIILower(cc)) { 395 m_temporaryBuffer.append(static_cast<LChar>(cc)); 396 addToPossibleEndTag(static_cast<LChar>(cc)); 397 HTML_ADVANCE_TO(RCDATAEndTagNameState); 398 } else { 399 bufferCharacter('<'); 400 bufferCharacter('/'); 401 HTML_RECONSUME_IN(RCDATAState); 402 } 403 } 404 END_STATE() 405 406 HTML_BEGIN_STATE(RCDATAEndTagNameState) { 407 if (isASCIIUpper(cc)) { 408 m_temporaryBuffer.append(static_cast<LChar>(cc)); 409 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 410 HTML_ADVANCE_TO(RCDATAEndTagNameState); 411 } else if (isASCIILower(cc)) { 412 m_temporaryBuffer.append(static_cast<LChar>(cc)); 413 addToPossibleEndTag(static_cast<LChar>(cc)); 414 HTML_ADVANCE_TO(RCDATAEndTagNameState); 415 } else { 416 if (isTokenizerWhitespace(cc)) { 417 if (isAppropriateEndTag()) { 418 m_temporaryBuffer.append(static_cast<LChar>(cc)); 419 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 420 } 421 } else if (cc == '/') { 422 if (isAppropriateEndTag()) { 423 m_temporaryBuffer.append(static_cast<LChar>(cc)); 424 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 425 } 426 } else if (cc == '>') { 427 if (isAppropriateEndTag()) { 428 m_temporaryBuffer.append(static_cast<LChar>(cc)); 429 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 430 } 431 } 432 bufferCharacter('<'); 433 bufferCharacter('/'); 434 m_token->appendToCharacter(m_temporaryBuffer); 435 m_bufferedEndTagName.clear(); 436 m_temporaryBuffer.clear(); 437 HTML_RECONSUME_IN(RCDATAState); 438 } 439 } 440 END_STATE() 441 442 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) { 443 if (cc == '/') { 444 m_temporaryBuffer.clear(); 445 ASSERT(m_bufferedEndTagName.isEmpty()); 446 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState); 447 } else { 448 bufferCharacter('<'); 449 HTML_RECONSUME_IN(RAWTEXTState); 450 } 451 } 452 END_STATE() 453 454 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) { 455 if (isASCIIUpper(cc)) { 456 m_temporaryBuffer.append(static_cast<LChar>(cc)); 457 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 458 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 459 } else if (isASCIILower(cc)) { 460 m_temporaryBuffer.append(static_cast<LChar>(cc)); 461 addToPossibleEndTag(static_cast<LChar>(cc)); 462 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 463 } else { 464 bufferCharacter('<'); 465 bufferCharacter('/'); 466 HTML_RECONSUME_IN(RAWTEXTState); 467 } 468 } 469 END_STATE() 470 471 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) { 472 if (isASCIIUpper(cc)) { 473 m_temporaryBuffer.append(static_cast<LChar>(cc)); 474 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 475 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 476 } else if (isASCIILower(cc)) { 477 m_temporaryBuffer.append(static_cast<LChar>(cc)); 478 addToPossibleEndTag(static_cast<LChar>(cc)); 479 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 480 } else { 481 if (isTokenizerWhitespace(cc)) { 482 if (isAppropriateEndTag()) { 483 m_temporaryBuffer.append(static_cast<LChar>(cc)); 484 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 485 } 486 } else if (cc == '/') { 487 if (isAppropriateEndTag()) { 488 m_temporaryBuffer.append(static_cast<LChar>(cc)); 489 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 490 } 491 } else if (cc == '>') { 492 if (isAppropriateEndTag()) { 493 m_temporaryBuffer.append(static_cast<LChar>(cc)); 494 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 495 } 496 } 497 bufferCharacter('<'); 498 bufferCharacter('/'); 499 m_token->appendToCharacter(m_temporaryBuffer); 500 m_bufferedEndTagName.clear(); 501 m_temporaryBuffer.clear(); 502 HTML_RECONSUME_IN(RAWTEXTState); 503 } 504 } 505 END_STATE() 506 507 HTML_BEGIN_STATE(ScriptDataLessThanSignState) { 508 if (cc == '/') { 509 m_temporaryBuffer.clear(); 510 ASSERT(m_bufferedEndTagName.isEmpty()); 511 HTML_ADVANCE_TO(ScriptDataEndTagOpenState); 512 } else if (cc == '!') { 513 bufferCharacter('<'); 514 bufferCharacter('!'); 515 HTML_ADVANCE_TO(ScriptDataEscapeStartState); 516 } else { 517 bufferCharacter('<'); 518 HTML_RECONSUME_IN(ScriptDataState); 519 } 520 } 521 END_STATE() 522 523 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) { 524 if (isASCIIUpper(cc)) { 525 m_temporaryBuffer.append(static_cast<LChar>(cc)); 526 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 527 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 528 } else if (isASCIILower(cc)) { 529 m_temporaryBuffer.append(static_cast<LChar>(cc)); 530 addToPossibleEndTag(static_cast<LChar>(cc)); 531 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 532 } else { 533 bufferCharacter('<'); 534 bufferCharacter('/'); 535 HTML_RECONSUME_IN(ScriptDataState); 536 } 537 } 538 END_STATE() 539 540 HTML_BEGIN_STATE(ScriptDataEndTagNameState) { 541 if (isASCIIUpper(cc)) { 542 m_temporaryBuffer.append(static_cast<LChar>(cc)); 543 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 544 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 545 } else if (isASCIILower(cc)) { 546 m_temporaryBuffer.append(static_cast<LChar>(cc)); 547 addToPossibleEndTag(static_cast<LChar>(cc)); 548 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 549 } else { 550 if (isTokenizerWhitespace(cc)) { 551 if (isAppropriateEndTag()) { 552 m_temporaryBuffer.append(static_cast<LChar>(cc)); 553 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 554 } 555 } else if (cc == '/') { 556 if (isAppropriateEndTag()) { 557 m_temporaryBuffer.append(static_cast<LChar>(cc)); 558 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 559 } 560 } else if (cc == '>') { 561 if (isAppropriateEndTag()) { 562 m_temporaryBuffer.append(static_cast<LChar>(cc)); 563 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 564 } 565 } 566 bufferCharacter('<'); 567 bufferCharacter('/'); 568 m_token->appendToCharacter(m_temporaryBuffer); 569 m_bufferedEndTagName.clear(); 570 m_temporaryBuffer.clear(); 571 HTML_RECONSUME_IN(ScriptDataState); 572 } 573 } 574 END_STATE() 575 576 HTML_BEGIN_STATE(ScriptDataEscapeStartState) { 577 if (cc == '-') { 578 bufferCharacter(cc); 579 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState); 580 } else 581 HTML_RECONSUME_IN(ScriptDataState); 582 } 583 END_STATE() 584 585 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) { 586 if (cc == '-') { 587 bufferCharacter(cc); 588 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 589 } else 590 HTML_RECONSUME_IN(ScriptDataState); 591 } 592 END_STATE() 593 594 HTML_BEGIN_STATE(ScriptDataEscapedState) { 595 if (cc == '-') { 596 bufferCharacter(cc); 597 HTML_ADVANCE_TO(ScriptDataEscapedDashState); 598 } else if (cc == '<') 599 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 600 else if (cc == kEndOfFileMarker) { 601 parseError(); 602 HTML_RECONSUME_IN(DataState); 603 } else { 604 bufferCharacter(cc); 605 HTML_ADVANCE_TO(ScriptDataEscapedState); 606 } 607 } 608 END_STATE() 609 610 HTML_BEGIN_STATE(ScriptDataEscapedDashState) { 611 if (cc == '-') { 612 bufferCharacter(cc); 613 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 614 } else if (cc == '<') 615 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 616 else if (cc == kEndOfFileMarker) { 617 parseError(); 618 HTML_RECONSUME_IN(DataState); 619 } else { 620 bufferCharacter(cc); 621 HTML_ADVANCE_TO(ScriptDataEscapedState); 622 } 623 } 624 END_STATE() 625 626 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) { 627 if (cc == '-') { 628 bufferCharacter(cc); 629 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 630 } else if (cc == '<') 631 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 632 else if (cc == '>') { 633 bufferCharacter(cc); 634 HTML_ADVANCE_TO(ScriptDataState); 635 } else if (cc == kEndOfFileMarker) { 636 parseError(); 637 HTML_RECONSUME_IN(DataState); 638 } else { 639 bufferCharacter(cc); 640 HTML_ADVANCE_TO(ScriptDataEscapedState); 641 } 642 } 643 END_STATE() 644 645 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 646 if (cc == '/') { 647 m_temporaryBuffer.clear(); 648 ASSERT(m_bufferedEndTagName.isEmpty()); 649 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 650 } else if (isASCIIUpper(cc)) { 651 bufferCharacter('<'); 652 bufferCharacter(cc); 653 m_temporaryBuffer.clear(); 654 m_temporaryBuffer.append(toLowerCase(cc)); 655 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 656 } else if (isASCIILower(cc)) { 657 bufferCharacter('<'); 658 bufferCharacter(cc); 659 m_temporaryBuffer.clear(); 660 m_temporaryBuffer.append(static_cast<LChar>(cc)); 661 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 662 } else { 663 bufferCharacter('<'); 664 HTML_RECONSUME_IN(ScriptDataEscapedState); 665 } 666 } 667 END_STATE() 668 669 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 670 if (isASCIIUpper(cc)) { 671 m_temporaryBuffer.append(static_cast<LChar>(cc)); 672 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 673 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 674 } else if (isASCIILower(cc)) { 675 m_temporaryBuffer.append(static_cast<LChar>(cc)); 676 addToPossibleEndTag(static_cast<LChar>(cc)); 677 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 678 } else { 679 bufferCharacter('<'); 680 bufferCharacter('/'); 681 HTML_RECONSUME_IN(ScriptDataEscapedState); 682 } 683 } 684 END_STATE() 685 686 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 687 if (isASCIIUpper(cc)) { 688 m_temporaryBuffer.append(static_cast<LChar>(cc)); 689 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 690 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 691 } else if (isASCIILower(cc)) { 692 m_temporaryBuffer.append(static_cast<LChar>(cc)); 693 addToPossibleEndTag(static_cast<LChar>(cc)); 694 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 695 } else { 696 if (isTokenizerWhitespace(cc)) { 697 if (isAppropriateEndTag()) { 698 m_temporaryBuffer.append(static_cast<LChar>(cc)); 699 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 700 } 701 } else if (cc == '/') { 702 if (isAppropriateEndTag()) { 703 m_temporaryBuffer.append(static_cast<LChar>(cc)); 704 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 705 } 706 } else if (cc == '>') { 707 if (isAppropriateEndTag()) { 708 m_temporaryBuffer.append(static_cast<LChar>(cc)); 709 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 710 } 711 } 712 bufferCharacter('<'); 713 bufferCharacter('/'); 714 m_token->appendToCharacter(m_temporaryBuffer); 715 m_bufferedEndTagName.clear(); 716 m_temporaryBuffer.clear(); 717 HTML_RECONSUME_IN(ScriptDataEscapedState); 718 } 719 } 720 END_STATE() 721 722 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 723 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 724 bufferCharacter(cc); 725 if (temporaryBufferIs(scriptTag.localName())) 726 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 727 else 728 HTML_ADVANCE_TO(ScriptDataEscapedState); 729 } else if (isASCIIUpper(cc)) { 730 bufferCharacter(cc); 731 m_temporaryBuffer.append(toLowerCase(cc)); 732 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 733 } else if (isASCIILower(cc)) { 734 bufferCharacter(cc); 735 m_temporaryBuffer.append(static_cast<LChar>(cc)); 736 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 737 } else 738 HTML_RECONSUME_IN(ScriptDataEscapedState); 739 } 740 END_STATE() 741 742 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) { 743 if (cc == '-') { 744 bufferCharacter(cc); 745 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState); 746 } else if (cc == '<') { 747 bufferCharacter(cc); 748 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 749 } else if (cc == kEndOfFileMarker) { 750 parseError(); 751 HTML_RECONSUME_IN(DataState); 752 } else { 753 bufferCharacter(cc); 754 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 755 } 756 } 757 END_STATE() 758 759 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 760 if (cc == '-') { 761 bufferCharacter(cc); 762 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 763 } else if (cc == '<') { 764 bufferCharacter(cc); 765 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 766 } else if (cc == kEndOfFileMarker) { 767 parseError(); 768 HTML_RECONSUME_IN(DataState); 769 } else { 770 bufferCharacter(cc); 771 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 772 } 773 } 774 END_STATE() 775 776 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 777 if (cc == '-') { 778 bufferCharacter(cc); 779 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 780 } else if (cc == '<') { 781 bufferCharacter(cc); 782 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 783 } else if (cc == '>') { 784 bufferCharacter(cc); 785 HTML_ADVANCE_TO(ScriptDataState); 786 } else if (cc == kEndOfFileMarker) { 787 parseError(); 788 HTML_RECONSUME_IN(DataState); 789 } else { 790 bufferCharacter(cc); 791 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 792 } 793 } 794 END_STATE() 795 796 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 797 if (cc == '/') { 798 bufferCharacter(cc); 799 m_temporaryBuffer.clear(); 800 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 801 } else 802 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 803 } 804 END_STATE() 805 806 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 807 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 808 bufferCharacter(cc); 809 if (temporaryBufferIs(scriptTag.localName())) 810 HTML_ADVANCE_TO(ScriptDataEscapedState); 811 else 812 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 813 } else if (isASCIIUpper(cc)) { 814 bufferCharacter(cc); 815 m_temporaryBuffer.append(toLowerCase(cc)); 816 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 817 } else if (isASCIILower(cc)) { 818 bufferCharacter(cc); 819 m_temporaryBuffer.append(static_cast<LChar>(cc)); 820 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 821 } else 822 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 823 } 824 END_STATE() 825 826 HTML_BEGIN_STATE(BeforeAttributeNameState) { 827 if (isTokenizerWhitespace(cc)) 828 HTML_ADVANCE_TO(BeforeAttributeNameState); 829 else if (cc == '/') 830 HTML_ADVANCE_TO(SelfClosingStartTagState); 831 else if (cc == '>') 832 return emitAndResumeIn(source, HTMLTokenizer::DataState); 833 else if (isASCIIUpper(cc)) { 834 m_token->addNewAttribute(); 835 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 836 m_token->appendToAttributeName(toLowerCase(cc)); 837 HTML_ADVANCE_TO(AttributeNameState); 838 } else if (cc == kEndOfFileMarker) { 839 parseError(); 840 HTML_RECONSUME_IN(DataState); 841 } else { 842 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 843 parseError(); 844 m_token->addNewAttribute(); 845 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 846 m_token->appendToAttributeName(cc); 847 HTML_ADVANCE_TO(AttributeNameState); 848 } 849 } 850 END_STATE() 851 852 HTML_BEGIN_STATE(AttributeNameState) { 853 if (isTokenizerWhitespace(cc)) { 854 m_token->endAttributeName(source.numberOfCharactersConsumed()); 855 HTML_ADVANCE_TO(AfterAttributeNameState); 856 } else if (cc == '/') { 857 m_token->endAttributeName(source.numberOfCharactersConsumed()); 858 HTML_ADVANCE_TO(SelfClosingStartTagState); 859 } else if (cc == '=') { 860 m_token->endAttributeName(source.numberOfCharactersConsumed()); 861 HTML_ADVANCE_TO(BeforeAttributeValueState); 862 } else if (cc == '>') { 863 m_token->endAttributeName(source.numberOfCharactersConsumed()); 864 return emitAndResumeIn(source, HTMLTokenizer::DataState); 865 } else if (isASCIIUpper(cc)) { 866 m_token->appendToAttributeName(toLowerCase(cc)); 867 HTML_ADVANCE_TO(AttributeNameState); 868 } else if (cc == kEndOfFileMarker) { 869 parseError(); 870 m_token->endAttributeName(source.numberOfCharactersConsumed()); 871 HTML_RECONSUME_IN(DataState); 872 } else { 873 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 874 parseError(); 875 m_token->appendToAttributeName(cc); 876 HTML_ADVANCE_TO(AttributeNameState); 877 } 878 } 879 END_STATE() 880 881 HTML_BEGIN_STATE(AfterAttributeNameState) { 882 if (isTokenizerWhitespace(cc)) 883 HTML_ADVANCE_TO(AfterAttributeNameState); 884 else if (cc == '/') 885 HTML_ADVANCE_TO(SelfClosingStartTagState); 886 else if (cc == '=') 887 HTML_ADVANCE_TO(BeforeAttributeValueState); 888 else if (cc == '>') 889 return emitAndResumeIn(source, HTMLTokenizer::DataState); 890 else if (isASCIIUpper(cc)) { 891 m_token->addNewAttribute(); 892 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 893 m_token->appendToAttributeName(toLowerCase(cc)); 894 HTML_ADVANCE_TO(AttributeNameState); 895 } else if (cc == kEndOfFileMarker) { 896 parseError(); 897 HTML_RECONSUME_IN(DataState); 898 } else { 899 if (cc == '"' || cc == '\'' || cc == '<') 900 parseError(); 901 m_token->addNewAttribute(); 902 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 903 m_token->appendToAttributeName(cc); 904 HTML_ADVANCE_TO(AttributeNameState); 905 } 906 } 907 END_STATE() 908 909 HTML_BEGIN_STATE(BeforeAttributeValueState) { 910 if (isTokenizerWhitespace(cc)) 911 HTML_ADVANCE_TO(BeforeAttributeValueState); 912 else if (cc == '"') { 913 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 914 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 915 } else if (cc == '&') { 916 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 917 HTML_RECONSUME_IN(AttributeValueUnquotedState); 918 } else if (cc == '\'') { 919 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 920 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 921 } else if (cc == '>') { 922 parseError(); 923 return emitAndResumeIn(source, HTMLTokenizer::DataState); 924 } else if (cc == kEndOfFileMarker) { 925 parseError(); 926 HTML_RECONSUME_IN(DataState); 927 } else { 928 if (cc == '<' || cc == '=' || cc == '`') 929 parseError(); 930 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 931 m_token->appendToAttributeValue(cc); 932 HTML_ADVANCE_TO(AttributeValueUnquotedState); 933 } 934 } 935 END_STATE() 936 937 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 938 if (cc == '"') { 939 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 940 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 941 } else if (cc == '&') { 942 m_additionalAllowedCharacter = '"'; 943 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 944 } else if (cc == kEndOfFileMarker) { 945 parseError(); 946 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 947 HTML_RECONSUME_IN(DataState); 948 } else { 949 m_token->appendToAttributeValue(cc); 950 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 951 } 952 } 953 END_STATE() 954 955 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 956 if (cc == '\'') { 957 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 958 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 959 } else if (cc == '&') { 960 m_additionalAllowedCharacter = '\''; 961 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 962 } else if (cc == kEndOfFileMarker) { 963 parseError(); 964 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 965 HTML_RECONSUME_IN(DataState); 966 } else { 967 m_token->appendToAttributeValue(cc); 968 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 969 } 970 } 971 END_STATE() 972 973 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 974 if (isTokenizerWhitespace(cc)) { 975 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 976 HTML_ADVANCE_TO(BeforeAttributeNameState); 977 } else if (cc == '&') { 978 m_additionalAllowedCharacter = '>'; 979 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 980 } else if (cc == '>') { 981 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 982 return emitAndResumeIn(source, HTMLTokenizer::DataState); 983 } else if (cc == kEndOfFileMarker) { 984 parseError(); 985 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 986 HTML_RECONSUME_IN(DataState); 987 } else { 988 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 989 parseError(); 990 m_token->appendToAttributeValue(cc); 991 HTML_ADVANCE_TO(AttributeValueUnquotedState); 992 } 993 } 994 END_STATE() 995 996 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { 997 bool notEnoughCharacters = false; 998 DecodedHTMLEntity decodedEntity; 999 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 1000 if (notEnoughCharacters) 1001 return haveBufferedCharacterToken(); 1002 if (!success) { 1003 ASSERT(decodedEntity.isEmpty()); 1004 m_token->appendToAttributeValue('&'); 1005 } else { 1006 for (unsigned i = 0; i < decodedEntity.length; ++i) 1007 m_token->appendToAttributeValue(decodedEntity.data[i]); 1008 } 1009 // We're supposed to switch back to the attribute value state that 1010 // we were in when we were switched into this state. Rather than 1011 // keeping track of this explictly, we observe that the previous 1012 // state can be determined by m_additionalAllowedCharacter. 1013 if (m_additionalAllowedCharacter == '"') 1014 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); 1015 else if (m_additionalAllowedCharacter == '\'') 1016 HTML_SWITCH_TO(AttributeValueSingleQuotedState); 1017 else if (m_additionalAllowedCharacter == '>') 1018 HTML_SWITCH_TO(AttributeValueUnquotedState); 1019 else 1020 ASSERT_NOT_REACHED(); 1021 } 1022 END_STATE() 1023 1024 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 1025 if (isTokenizerWhitespace(cc)) 1026 HTML_ADVANCE_TO(BeforeAttributeNameState); 1027 else if (cc == '/') 1028 HTML_ADVANCE_TO(SelfClosingStartTagState); 1029 else if (cc == '>') 1030 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1031 else if (cc == kEndOfFileMarker) { 1032 parseError(); 1033 HTML_RECONSUME_IN(DataState); 1034 } else { 1035 parseError(); 1036 HTML_RECONSUME_IN(BeforeAttributeNameState); 1037 } 1038 } 1039 END_STATE() 1040 1041 HTML_BEGIN_STATE(SelfClosingStartTagState) { 1042 if (cc == '>') { 1043 m_token->setSelfClosing(); 1044 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1045 } else if (cc == kEndOfFileMarker) { 1046 parseError(); 1047 HTML_RECONSUME_IN(DataState); 1048 } else { 1049 parseError(); 1050 HTML_RECONSUME_IN(BeforeAttributeNameState); 1051 } 1052 } 1053 END_STATE() 1054 1055 HTML_BEGIN_STATE(BogusCommentState) { 1056 m_token->beginComment(); 1057 HTML_RECONSUME_IN(ContinueBogusCommentState); 1058 } 1059 END_STATE() 1060 1061 HTML_BEGIN_STATE(ContinueBogusCommentState) { 1062 if (cc == '>') 1063 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1064 else if (cc == kEndOfFileMarker) 1065 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1066 else { 1067 m_token->appendToComment(cc); 1068 HTML_ADVANCE_TO(ContinueBogusCommentState); 1069 } 1070 } 1071 END_STATE() 1072 1073 HTML_BEGIN_STATE(MarkupDeclarationOpenState) { 1074 DEFINE_STATIC_LOCAL(String, dashDashString, ("--")); 1075 DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype")); 1076 DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA[")); 1077 if (cc == '-') { 1078 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); 1079 if (result == SegmentedString::DidMatch) { 1080 source.advanceAndASSERT('-'); 1081 source.advanceAndASSERT('-'); 1082 m_token->beginComment(); 1083 HTML_SWITCH_TO(CommentStartState); 1084 } else if (result == SegmentedString::NotEnoughCharacters) 1085 return haveBufferedCharacterToken(); 1086 } else if (cc == 'D' || cc == 'd') { 1087 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); 1088 if (result == SegmentedString::DidMatch) { 1089 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1090 HTML_SWITCH_TO(DOCTYPEState); 1091 } else if (result == SegmentedString::NotEnoughCharacters) 1092 return haveBufferedCharacterToken(); 1093 } else if (cc == '[' && shouldAllowCDATA()) { 1094 SegmentedString::LookAheadResult result = source.lookAhead(cdataString); 1095 if (result == SegmentedString::DidMatch) { 1096 advanceStringAndASSERT(source, "[CDATA["); 1097 HTML_SWITCH_TO(CDATASectionState); 1098 } else if (result == SegmentedString::NotEnoughCharacters) 1099 return haveBufferedCharacterToken(); 1100 } 1101 parseError(); 1102 HTML_RECONSUME_IN(BogusCommentState); 1103 } 1104 END_STATE() 1105 1106 HTML_BEGIN_STATE(CommentStartState) { 1107 if (cc == '-') 1108 HTML_ADVANCE_TO(CommentStartDashState); 1109 else if (cc == '>') { 1110 parseError(); 1111 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1112 } else if (cc == kEndOfFileMarker) { 1113 parseError(); 1114 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1115 } else { 1116 m_token->appendToComment(cc); 1117 HTML_ADVANCE_TO(CommentState); 1118 } 1119 } 1120 END_STATE() 1121 1122 HTML_BEGIN_STATE(CommentStartDashState) { 1123 if (cc == '-') 1124 HTML_ADVANCE_TO(CommentEndState); 1125 else if (cc == '>') { 1126 parseError(); 1127 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1128 } else if (cc == kEndOfFileMarker) { 1129 parseError(); 1130 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1131 } else { 1132 m_token->appendToComment('-'); 1133 m_token->appendToComment(cc); 1134 HTML_ADVANCE_TO(CommentState); 1135 } 1136 } 1137 END_STATE() 1138 1139 HTML_BEGIN_STATE(CommentState) { 1140 if (cc == '-') 1141 HTML_ADVANCE_TO(CommentEndDashState); 1142 else if (cc == kEndOfFileMarker) { 1143 parseError(); 1144 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1145 } else { 1146 m_token->appendToComment(cc); 1147 HTML_ADVANCE_TO(CommentState); 1148 } 1149 } 1150 END_STATE() 1151 1152 HTML_BEGIN_STATE(CommentEndDashState) { 1153 if (cc == '-') 1154 HTML_ADVANCE_TO(CommentEndState); 1155 else if (cc == kEndOfFileMarker) { 1156 parseError(); 1157 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1158 } else { 1159 m_token->appendToComment('-'); 1160 m_token->appendToComment(cc); 1161 HTML_ADVANCE_TO(CommentState); 1162 } 1163 } 1164 END_STATE() 1165 1166 HTML_BEGIN_STATE(CommentEndState) { 1167 if (cc == '>') 1168 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1169 else if (cc == '!') { 1170 parseError(); 1171 HTML_ADVANCE_TO(CommentEndBangState); 1172 } else if (cc == '-') { 1173 parseError(); 1174 m_token->appendToComment('-'); 1175 HTML_ADVANCE_TO(CommentEndState); 1176 } else if (cc == kEndOfFileMarker) { 1177 parseError(); 1178 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1179 } else { 1180 parseError(); 1181 m_token->appendToComment('-'); 1182 m_token->appendToComment('-'); 1183 m_token->appendToComment(cc); 1184 HTML_ADVANCE_TO(CommentState); 1185 } 1186 } 1187 END_STATE() 1188 1189 HTML_BEGIN_STATE(CommentEndBangState) { 1190 if (cc == '-') { 1191 m_token->appendToComment('-'); 1192 m_token->appendToComment('-'); 1193 m_token->appendToComment('!'); 1194 HTML_ADVANCE_TO(CommentEndDashState); 1195 } else if (cc == '>') 1196 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1197 else if (cc == kEndOfFileMarker) { 1198 parseError(); 1199 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1200 } else { 1201 m_token->appendToComment('-'); 1202 m_token->appendToComment('-'); 1203 m_token->appendToComment('!'); 1204 m_token->appendToComment(cc); 1205 HTML_ADVANCE_TO(CommentState); 1206 } 1207 } 1208 END_STATE() 1209 1210 HTML_BEGIN_STATE(DOCTYPEState) { 1211 if (isTokenizerWhitespace(cc)) 1212 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1213 else if (cc == kEndOfFileMarker) { 1214 parseError(); 1215 m_token->beginDOCTYPE(); 1216 m_token->setForceQuirks(); 1217 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1218 } else { 1219 parseError(); 1220 HTML_RECONSUME_IN(BeforeDOCTYPENameState); 1221 } 1222 } 1223 END_STATE() 1224 1225 HTML_BEGIN_STATE(BeforeDOCTYPENameState) { 1226 if (isTokenizerWhitespace(cc)) 1227 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1228 else if (isASCIIUpper(cc)) { 1229 m_token->beginDOCTYPE(toLowerCase(cc)); 1230 HTML_ADVANCE_TO(DOCTYPENameState); 1231 } else if (cc == '>') { 1232 parseError(); 1233 m_token->beginDOCTYPE(); 1234 m_token->setForceQuirks(); 1235 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1236 } else if (cc == kEndOfFileMarker) { 1237 parseError(); 1238 m_token->beginDOCTYPE(); 1239 m_token->setForceQuirks(); 1240 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1241 } else { 1242 m_token->beginDOCTYPE(cc); 1243 HTML_ADVANCE_TO(DOCTYPENameState); 1244 } 1245 } 1246 END_STATE() 1247 1248 HTML_BEGIN_STATE(DOCTYPENameState) { 1249 if (isTokenizerWhitespace(cc)) 1250 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1251 else if (cc == '>') 1252 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1253 else if (isASCIIUpper(cc)) { 1254 m_token->appendToName(toLowerCase(cc)); 1255 HTML_ADVANCE_TO(DOCTYPENameState); 1256 } else if (cc == kEndOfFileMarker) { 1257 parseError(); 1258 m_token->setForceQuirks(); 1259 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1260 } else { 1261 m_token->appendToName(cc); 1262 HTML_ADVANCE_TO(DOCTYPENameState); 1263 } 1264 } 1265 END_STATE() 1266 1267 HTML_BEGIN_STATE(AfterDOCTYPENameState) { 1268 if (isTokenizerWhitespace(cc)) 1269 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1270 if (cc == '>') 1271 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1272 else if (cc == kEndOfFileMarker) { 1273 parseError(); 1274 m_token->setForceQuirks(); 1275 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1276 } else { 1277 DEFINE_STATIC_LOCAL(String, publicString, ("public")); 1278 DEFINE_STATIC_LOCAL(String, systemString, ("system")); 1279 if (cc == 'P' || cc == 'p') { 1280 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); 1281 if (result == SegmentedString::DidMatch) { 1282 advanceStringAndASSERTIgnoringCase(source, "public"); 1283 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1284 } else if (result == SegmentedString::NotEnoughCharacters) 1285 return haveBufferedCharacterToken(); 1286 } else if (cc == 'S' || cc == 's') { 1287 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); 1288 if (result == SegmentedString::DidMatch) { 1289 advanceStringAndASSERTIgnoringCase(source, "system"); 1290 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState); 1291 } else if (result == SegmentedString::NotEnoughCharacters) 1292 return haveBufferedCharacterToken(); 1293 } 1294 parseError(); 1295 m_token->setForceQuirks(); 1296 HTML_ADVANCE_TO(BogusDOCTYPEState); 1297 } 1298 } 1299 END_STATE() 1300 1301 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1302 if (isTokenizerWhitespace(cc)) 1303 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1304 else if (cc == '"') { 1305 parseError(); 1306 m_token->setPublicIdentifierToEmptyString(); 1307 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1308 } else if (cc == '\'') { 1309 parseError(); 1310 m_token->setPublicIdentifierToEmptyString(); 1311 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1312 } else if (cc == '>') { 1313 parseError(); 1314 m_token->setForceQuirks(); 1315 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1316 } else if (cc == kEndOfFileMarker) { 1317 parseError(); 1318 m_token->setForceQuirks(); 1319 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1320 } else { 1321 parseError(); 1322 m_token->setForceQuirks(); 1323 HTML_ADVANCE_TO(BogusDOCTYPEState); 1324 } 1325 } 1326 END_STATE() 1327 1328 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1329 if (isTokenizerWhitespace(cc)) 1330 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1331 else if (cc == '"') { 1332 m_token->setPublicIdentifierToEmptyString(); 1333 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1334 } else if (cc == '\'') { 1335 m_token->setPublicIdentifierToEmptyString(); 1336 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1337 } else if (cc == '>') { 1338 parseError(); 1339 m_token->setForceQuirks(); 1340 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1341 } else if (cc == kEndOfFileMarker) { 1342 parseError(); 1343 m_token->setForceQuirks(); 1344 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1345 } else { 1346 parseError(); 1347 m_token->setForceQuirks(); 1348 HTML_ADVANCE_TO(BogusDOCTYPEState); 1349 } 1350 } 1351 END_STATE() 1352 1353 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1354 if (cc == '"') 1355 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1356 else if (cc == '>') { 1357 parseError(); 1358 m_token->setForceQuirks(); 1359 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1360 } else if (cc == kEndOfFileMarker) { 1361 parseError(); 1362 m_token->setForceQuirks(); 1363 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1364 } else { 1365 m_token->appendToPublicIdentifier(cc); 1366 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1367 } 1368 } 1369 END_STATE() 1370 1371 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1372 if (cc == '\'') 1373 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1374 else if (cc == '>') { 1375 parseError(); 1376 m_token->setForceQuirks(); 1377 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1378 } else if (cc == kEndOfFileMarker) { 1379 parseError(); 1380 m_token->setForceQuirks(); 1381 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1382 } else { 1383 m_token->appendToPublicIdentifier(cc); 1384 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1385 } 1386 } 1387 END_STATE() 1388 1389 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1390 if (isTokenizerWhitespace(cc)) 1391 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1392 else if (cc == '>') 1393 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1394 else if (cc == '"') { 1395 parseError(); 1396 m_token->setSystemIdentifierToEmptyString(); 1397 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1398 } else if (cc == '\'') { 1399 parseError(); 1400 m_token->setSystemIdentifierToEmptyString(); 1401 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1402 } else if (cc == kEndOfFileMarker) { 1403 parseError(); 1404 m_token->setForceQuirks(); 1405 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1406 } else { 1407 parseError(); 1408 m_token->setForceQuirks(); 1409 HTML_ADVANCE_TO(BogusDOCTYPEState); 1410 } 1411 } 1412 END_STATE() 1413 1414 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1415 if (isTokenizerWhitespace(cc)) 1416 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1417 else if (cc == '>') 1418 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1419 else if (cc == '"') { 1420 m_token->setSystemIdentifierToEmptyString(); 1421 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1422 } else if (cc == '\'') { 1423 m_token->setSystemIdentifierToEmptyString(); 1424 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1425 } else if (cc == kEndOfFileMarker) { 1426 parseError(); 1427 m_token->setForceQuirks(); 1428 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1429 } else { 1430 parseError(); 1431 m_token->setForceQuirks(); 1432 HTML_ADVANCE_TO(BogusDOCTYPEState); 1433 } 1434 } 1435 END_STATE() 1436 1437 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1438 if (isTokenizerWhitespace(cc)) 1439 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1440 else if (cc == '"') { 1441 parseError(); 1442 m_token->setSystemIdentifierToEmptyString(); 1443 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1444 } else if (cc == '\'') { 1445 parseError(); 1446 m_token->setSystemIdentifierToEmptyString(); 1447 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1448 } else if (cc == '>') { 1449 parseError(); 1450 m_token->setForceQuirks(); 1451 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1452 } else if (cc == kEndOfFileMarker) { 1453 parseError(); 1454 m_token->setForceQuirks(); 1455 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1456 } else { 1457 parseError(); 1458 m_token->setForceQuirks(); 1459 HTML_ADVANCE_TO(BogusDOCTYPEState); 1460 } 1461 } 1462 END_STATE() 1463 1464 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1465 if (isTokenizerWhitespace(cc)) 1466 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1467 if (cc == '"') { 1468 m_token->setSystemIdentifierToEmptyString(); 1469 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1470 } else if (cc == '\'') { 1471 m_token->setSystemIdentifierToEmptyString(); 1472 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1473 } else if (cc == '>') { 1474 parseError(); 1475 m_token->setForceQuirks(); 1476 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1477 } else if (cc == kEndOfFileMarker) { 1478 parseError(); 1479 m_token->setForceQuirks(); 1480 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1481 } else { 1482 parseError(); 1483 m_token->setForceQuirks(); 1484 HTML_ADVANCE_TO(BogusDOCTYPEState); 1485 } 1486 } 1487 END_STATE() 1488 1489 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1490 if (cc == '"') 1491 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1492 else if (cc == '>') { 1493 parseError(); 1494 m_token->setForceQuirks(); 1495 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1496 } else if (cc == kEndOfFileMarker) { 1497 parseError(); 1498 m_token->setForceQuirks(); 1499 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1500 } else { 1501 m_token->appendToSystemIdentifier(cc); 1502 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1503 } 1504 } 1505 END_STATE() 1506 1507 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1508 if (cc == '\'') 1509 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1510 else if (cc == '>') { 1511 parseError(); 1512 m_token->setForceQuirks(); 1513 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1514 } else if (cc == kEndOfFileMarker) { 1515 parseError(); 1516 m_token->setForceQuirks(); 1517 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1518 } else { 1519 m_token->appendToSystemIdentifier(cc); 1520 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1521 } 1522 } 1523 END_STATE() 1524 1525 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1526 if (isTokenizerWhitespace(cc)) 1527 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1528 else if (cc == '>') 1529 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1530 else if (cc == kEndOfFileMarker) { 1531 parseError(); 1532 m_token->setForceQuirks(); 1533 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1534 } else { 1535 parseError(); 1536 HTML_ADVANCE_TO(BogusDOCTYPEState); 1537 } 1538 } 1539 END_STATE() 1540 1541 HTML_BEGIN_STATE(BogusDOCTYPEState) { 1542 if (cc == '>') 1543 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1544 else if (cc == kEndOfFileMarker) 1545 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1546 HTML_ADVANCE_TO(BogusDOCTYPEState); 1547 } 1548 END_STATE() 1549 1550 HTML_BEGIN_STATE(CDATASectionState) { 1551 if (cc == ']') 1552 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState); 1553 else if (cc == kEndOfFileMarker) 1554 HTML_RECONSUME_IN(DataState); 1555 else { 1556 bufferCharacter(cc); 1557 HTML_ADVANCE_TO(CDATASectionState); 1558 } 1559 } 1560 END_STATE() 1561 1562 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) { 1563 if (cc == ']') 1564 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1565 else { 1566 bufferCharacter(']'); 1567 HTML_RECONSUME_IN(CDATASectionState); 1568 } 1569 } 1570 1571 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1572 if (cc == '>') 1573 HTML_ADVANCE_TO(DataState); 1574 else { 1575 bufferCharacter(']'); 1576 bufferCharacter(']'); 1577 HTML_RECONSUME_IN(CDATASectionState); 1578 } 1579 } 1580 END_STATE() 1581 1582 } 1583 1584 ASSERT_NOT_REACHED(); 1585 return false; 1586 } 1587 1588 String HTMLTokenizer::bufferedCharacters() const 1589 { 1590 // FIXME: Add an assert about m_state. 1591 StringBuilder characters; 1592 characters.reserveCapacity(numberOfBufferedCharacters()); 1593 characters.append('<'); 1594 characters.append('/'); 1595 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size()); 1596 return characters.toString(); 1597 } 1598 1599 void HTMLTokenizer::updateStateFor(const AtomicString& tagName) 1600 { 1601 if (tagName == textareaTag || tagName == titleTag) 1602 setState(HTMLTokenizer::RCDATAState); 1603 else if (tagName == plaintextTag) 1604 setState(HTMLTokenizer::PLAINTEXTState); 1605 else if (tagName == scriptTag) 1606 setState(HTMLTokenizer::ScriptDataState); 1607 else if (tagName == styleTag 1608 || tagName == iframeTag 1609 || tagName == xmpTag 1610 || (tagName == noembedTag && m_options.pluginsEnabled) 1611 || tagName == noframesTag 1612 || (tagName == noscriptTag && m_options.scriptEnabled)) 1613 setState(HTMLTokenizer::RAWTEXTState); 1614 } 1615 1616 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1617 { 1618 return vectorEqualsString(m_temporaryBuffer, expectedString); 1619 } 1620 1621 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc) 1622 { 1623 ASSERT(isEndTagBufferingState(m_state)); 1624 m_bufferedEndTagName.append(cc); 1625 } 1626 1627 inline bool HTMLTokenizer::isAppropriateEndTag() 1628 { 1629 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size()) 1630 return false; 1631 1632 size_t numCharacters = m_bufferedEndTagName.size(); 1633 1634 for (size_t i = 0; i < numCharacters; i++) { 1635 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i]) 1636 return false; 1637 } 1638 1639 return true; 1640 } 1641 1642 inline void HTMLTokenizer::parseError() 1643 { 1644 notImplemented(); 1645 } 1646 1647 } 1648