1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/html/parser/HTMLTokenizer.h" 30 31 #include "HTMLNames.h" 32 #include "core/html/parser/HTMLEntityParser.h" 33 #include "core/html/parser/HTMLToken.h" 34 #include "core/html/parser/HTMLTreeBuilder.h" 35 #include "core/platform/NotImplemented.h" 36 #include "core/xml/parser/MarkupTokenizerInlines.h" 37 #include "wtf/ASCIICType.h" 38 #include "wtf/text/AtomicString.h" 39 #include "wtf/unicode/Unicode.h" 40 41 using namespace WTF; 42 43 namespace WebCore { 44 45 using namespace HTMLNames; 46 47 // This has to go in a .cpp file, as the linker doesn't like it being included more than once. 48 // We don't have an HTMLToken.cpp though, so this is the next best place. 49 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attribute) const 50 { 51 return QualifiedName(nullAtom, AtomicString(attribute.name), nullAtom); 52 } 53 54 bool AtomicHTMLToken::usesName() const 55 { 56 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE; 57 } 58 59 bool AtomicHTMLToken::usesAttributes() const 60 { 61 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag; 62 } 63 64 static inline UChar toLowerCase(UChar cc) 65 { 66 ASSERT(isASCIIUpper(cc)); 67 const int lowerCaseOffset = 0x20; 68 return cc + lowerCaseOffset; 69 } 70 71 static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string) 72 { 73 if (vector.size() != string.length()) 74 return false; 75 76 if (!string.length()) 77 return true; 78 79 return equal(string.impl(), vector.data(), vector.size()); 80 } 81 82 static inline bool isEndTagBufferingState(HTMLTokenizer::State state) 83 { 84 switch (state) { 85 case HTMLTokenizer::RCDATAEndTagOpenState: 86 case HTMLTokenizer::RCDATAEndTagNameState: 87 case HTMLTokenizer::RAWTEXTEndTagOpenState: 88 case HTMLTokenizer::RAWTEXTEndTagNameState: 89 case HTMLTokenizer::ScriptDataEndTagOpenState: 90 case HTMLTokenizer::ScriptDataEndTagNameState: 91 case HTMLTokenizer::ScriptDataEscapedEndTagOpenState: 92 case HTMLTokenizer::ScriptDataEscapedEndTagNameState: 93 return true; 94 default: 95 return false; 96 } 97 } 98 99 #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName) 100 #define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName) 101 #define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName) 102 #define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName) 103 104 HTMLTokenizer::HTMLTokenizer(const HTMLParserOptions& options) 105 : m_inputStreamPreprocessor(this) 106 , m_options(options) 107 { 108 reset(); 109 } 110 111 HTMLTokenizer::~HTMLTokenizer() 112 { 113 } 114 115 void HTMLTokenizer::reset() 116 { 117 m_state = HTMLTokenizer::DataState; 118 m_token = 0; 119 m_forceNullCharacterReplacement = false; 120 m_shouldAllowCDATA = false; 121 m_additionalAllowedCharacter = '\0'; 122 } 123 124 bool HTMLTokenizer::canCreateCheckpoint() const 125 { 126 if (!m_appropriateEndTagName.isEmpty()) 127 return false; 128 if (!m_temporaryBuffer.isEmpty()) 129 return false; 130 if (!m_bufferedEndTagName.isEmpty()) 131 return false; 132 return true; 133 } 134 135 void HTMLTokenizer::createCheckpoint(Checkpoint& result) const 136 { 137 ASSERT(canCreateCheckpoint()); 138 result.options = m_options; 139 result.state = m_state; 140 result.additionalAllowedCharacter = m_additionalAllowedCharacter; 141 result.skipNextNewLine = m_inputStreamPreprocessor.skipNextNewLine(); 142 result.shouldAllowCDATA = m_shouldAllowCDATA; 143 } 144 145 void HTMLTokenizer::restoreFromCheckpoint(const Checkpoint& checkpoint) 146 { 147 m_token = 0; 148 m_options = checkpoint.options; 149 m_state = checkpoint.state; 150 m_additionalAllowedCharacter = checkpoint.additionalAllowedCharacter; 151 m_inputStreamPreprocessor.reset(checkpoint.skipNextNewLine); 152 m_shouldAllowCDATA = checkpoint.shouldAllowCDATA; 153 } 154 155 inline bool HTMLTokenizer::processEntity(SegmentedString& source) 156 { 157 bool notEnoughCharacters = false; 158 DecodedHTMLEntity decodedEntity; 159 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); 160 if (notEnoughCharacters) 161 return false; 162 if (!success) { 163 ASSERT(decodedEntity.isEmpty()); 164 bufferCharacter('&'); 165 } else { 166 for (unsigned i = 0; i < decodedEntity.length; ++i) 167 bufferCharacter(decodedEntity.data[i]); 168 } 169 return true; 170 } 171 172 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 173 { 174 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 175 source.advanceAndUpdateLineNumber(); 176 if (m_token->type() == HTMLToken::Character) 177 return true; 178 m_token->beginEndTag(m_bufferedEndTagName); 179 m_bufferedEndTagName.clear(); 180 m_appropriateEndTagName.clear(); 181 m_temporaryBuffer.clear(); 182 return false; 183 } 184 185 #define FLUSH_AND_ADVANCE_TO(stateName) \ 186 do { \ 187 m_state = HTMLTokenizer::stateName; \ 188 if (flushBufferedEndTag(source)) \ 189 return true; \ 190 if (source.isEmpty() \ 191 || !m_inputStreamPreprocessor.peek(source)) \ 192 return haveBufferedCharacterToken(); \ 193 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 194 goto stateName; \ 195 } while (false) 196 197 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 198 { 199 m_state = state; 200 flushBufferedEndTag(source); 201 return true; 202 } 203 204 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token) 205 { 206 // If we have a token in progress, then we're supposed to be called back 207 // with the same token so we can finish it. 208 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized); 209 m_token = &token; 210 211 if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) { 212 // FIXME: This should call flushBufferedEndTag(). 213 // We started an end tag during our last iteration. 214 m_token->beginEndTag(m_bufferedEndTagName); 215 m_bufferedEndTagName.clear(); 216 m_appropriateEndTagName.clear(); 217 m_temporaryBuffer.clear(); 218 if (m_state == HTMLTokenizer::DataState) { 219 // We're back in the data state, so we must be done with the tag. 220 return true; 221 } 222 } 223 224 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 225 return haveBufferedCharacterToken(); 226 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 227 228 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 229 switch (m_state) { 230 HTML_BEGIN_STATE(DataState) { 231 if (cc == '&') 232 HTML_ADVANCE_TO(CharacterReferenceInDataState); 233 else if (cc == '<') { 234 if (m_token->type() == HTMLToken::Character) { 235 // We have a bunch of character tokens queued up that we 236 // are emitting lazily here. 237 return true; 238 } 239 HTML_ADVANCE_TO(TagOpenState); 240 } else if (cc == kEndOfFileMarker) 241 return emitEndOfFile(source); 242 else { 243 bufferCharacter(cc); 244 HTML_ADVANCE_TO(DataState); 245 } 246 } 247 END_STATE() 248 249 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 250 if (!processEntity(source)) 251 return haveBufferedCharacterToken(); 252 HTML_SWITCH_TO(DataState); 253 } 254 END_STATE() 255 256 HTML_BEGIN_STATE(RCDATAState) { 257 if (cc == '&') 258 HTML_ADVANCE_TO(CharacterReferenceInRCDATAState); 259 else if (cc == '<') 260 HTML_ADVANCE_TO(RCDATALessThanSignState); 261 else if (cc == kEndOfFileMarker) 262 return emitEndOfFile(source); 263 else { 264 bufferCharacter(cc); 265 HTML_ADVANCE_TO(RCDATAState); 266 } 267 } 268 END_STATE() 269 270 HTML_BEGIN_STATE(CharacterReferenceInRCDATAState) { 271 if (!processEntity(source)) 272 return haveBufferedCharacterToken(); 273 HTML_SWITCH_TO(RCDATAState); 274 } 275 END_STATE() 276 277 HTML_BEGIN_STATE(RAWTEXTState) { 278 if (cc == '<') 279 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 280 else if (cc == kEndOfFileMarker) 281 return emitEndOfFile(source); 282 else { 283 bufferCharacter(cc); 284 HTML_ADVANCE_TO(RAWTEXTState); 285 } 286 } 287 END_STATE() 288 289 HTML_BEGIN_STATE(ScriptDataState) { 290 if (cc == '<') 291 HTML_ADVANCE_TO(ScriptDataLessThanSignState); 292 else if (cc == kEndOfFileMarker) 293 return emitEndOfFile(source); 294 else { 295 bufferCharacter(cc); 296 HTML_ADVANCE_TO(ScriptDataState); 297 } 298 } 299 END_STATE() 300 301 HTML_BEGIN_STATE(PLAINTEXTState) { 302 if (cc == kEndOfFileMarker) 303 return emitEndOfFile(source); 304 bufferCharacter(cc); 305 HTML_ADVANCE_TO(PLAINTEXTState); 306 } 307 END_STATE() 308 309 HTML_BEGIN_STATE(TagOpenState) { 310 if (cc == '!') 311 HTML_ADVANCE_TO(MarkupDeclarationOpenState); 312 else if (cc == '/') 313 HTML_ADVANCE_TO(EndTagOpenState); 314 else if (isASCIIUpper(cc)) { 315 m_token->beginStartTag(toLowerCase(cc)); 316 HTML_ADVANCE_TO(TagNameState); 317 } else if (isASCIILower(cc)) { 318 m_token->beginStartTag(cc); 319 HTML_ADVANCE_TO(TagNameState); 320 } else if (cc == '?') { 321 parseError(); 322 // The spec consumes the current character before switching 323 // to the bogus comment state, but it's easier to implement 324 // if we reconsume the current character. 325 HTML_RECONSUME_IN(BogusCommentState); 326 } else { 327 parseError(); 328 bufferCharacter('<'); 329 HTML_RECONSUME_IN(DataState); 330 } 331 } 332 END_STATE() 333 334 HTML_BEGIN_STATE(EndTagOpenState) { 335 if (isASCIIUpper(cc)) { 336 m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc))); 337 m_appropriateEndTagName.clear(); 338 HTML_ADVANCE_TO(TagNameState); 339 } else if (isASCIILower(cc)) { 340 m_token->beginEndTag(static_cast<LChar>(cc)); 341 m_appropriateEndTagName.clear(); 342 HTML_ADVANCE_TO(TagNameState); 343 } else if (cc == '>') { 344 parseError(); 345 HTML_ADVANCE_TO(DataState); 346 } else if (cc == kEndOfFileMarker) { 347 parseError(); 348 bufferCharacter('<'); 349 bufferCharacter('/'); 350 HTML_RECONSUME_IN(DataState); 351 } else { 352 parseError(); 353 HTML_RECONSUME_IN(BogusCommentState); 354 } 355 } 356 END_STATE() 357 358 HTML_BEGIN_STATE(TagNameState) { 359 if (isTokenizerWhitespace(cc)) 360 HTML_ADVANCE_TO(BeforeAttributeNameState); 361 else if (cc == '/') 362 HTML_ADVANCE_TO(SelfClosingStartTagState); 363 else if (cc == '>') 364 return emitAndResumeIn(source, HTMLTokenizer::DataState); 365 else if (isASCIIUpper(cc)) { 366 m_token->appendToName(toLowerCase(cc)); 367 HTML_ADVANCE_TO(TagNameState); 368 } else if (cc == kEndOfFileMarker) { 369 parseError(); 370 HTML_RECONSUME_IN(DataState); 371 } else { 372 m_token->appendToName(cc); 373 HTML_ADVANCE_TO(TagNameState); 374 } 375 } 376 END_STATE() 377 378 HTML_BEGIN_STATE(RCDATALessThanSignState) { 379 if (cc == '/') { 380 m_temporaryBuffer.clear(); 381 ASSERT(m_bufferedEndTagName.isEmpty()); 382 HTML_ADVANCE_TO(RCDATAEndTagOpenState); 383 } else { 384 bufferCharacter('<'); 385 HTML_RECONSUME_IN(RCDATAState); 386 } 387 } 388 END_STATE() 389 390 HTML_BEGIN_STATE(RCDATAEndTagOpenState) { 391 if (isASCIIUpper(cc)) { 392 m_temporaryBuffer.append(static_cast<LChar>(cc)); 393 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 394 HTML_ADVANCE_TO(RCDATAEndTagNameState); 395 } else if (isASCIILower(cc)) { 396 m_temporaryBuffer.append(static_cast<LChar>(cc)); 397 addToPossibleEndTag(static_cast<LChar>(cc)); 398 HTML_ADVANCE_TO(RCDATAEndTagNameState); 399 } else { 400 bufferCharacter('<'); 401 bufferCharacter('/'); 402 HTML_RECONSUME_IN(RCDATAState); 403 } 404 } 405 END_STATE() 406 407 HTML_BEGIN_STATE(RCDATAEndTagNameState) { 408 if (isASCIIUpper(cc)) { 409 m_temporaryBuffer.append(static_cast<LChar>(cc)); 410 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 411 HTML_ADVANCE_TO(RCDATAEndTagNameState); 412 } else if (isASCIILower(cc)) { 413 m_temporaryBuffer.append(static_cast<LChar>(cc)); 414 addToPossibleEndTag(static_cast<LChar>(cc)); 415 HTML_ADVANCE_TO(RCDATAEndTagNameState); 416 } else { 417 if (isTokenizerWhitespace(cc)) { 418 if (isAppropriateEndTag()) { 419 m_temporaryBuffer.append(static_cast<LChar>(cc)); 420 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 421 } 422 } else if (cc == '/') { 423 if (isAppropriateEndTag()) { 424 m_temporaryBuffer.append(static_cast<LChar>(cc)); 425 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 426 } 427 } else if (cc == '>') { 428 if (isAppropriateEndTag()) { 429 m_temporaryBuffer.append(static_cast<LChar>(cc)); 430 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 431 } 432 } 433 bufferCharacter('<'); 434 bufferCharacter('/'); 435 m_token->appendToCharacter(m_temporaryBuffer); 436 m_bufferedEndTagName.clear(); 437 m_temporaryBuffer.clear(); 438 HTML_RECONSUME_IN(RCDATAState); 439 } 440 } 441 END_STATE() 442 443 HTML_BEGIN_STATE(RAWTEXTLessThanSignState) { 444 if (cc == '/') { 445 m_temporaryBuffer.clear(); 446 ASSERT(m_bufferedEndTagName.isEmpty()); 447 HTML_ADVANCE_TO(RAWTEXTEndTagOpenState); 448 } else { 449 bufferCharacter('<'); 450 HTML_RECONSUME_IN(RAWTEXTState); 451 } 452 } 453 END_STATE() 454 455 HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) { 456 if (isASCIIUpper(cc)) { 457 m_temporaryBuffer.append(static_cast<LChar>(cc)); 458 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 459 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 460 } else if (isASCIILower(cc)) { 461 m_temporaryBuffer.append(static_cast<LChar>(cc)); 462 addToPossibleEndTag(static_cast<LChar>(cc)); 463 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 464 } else { 465 bufferCharacter('<'); 466 bufferCharacter('/'); 467 HTML_RECONSUME_IN(RAWTEXTState); 468 } 469 } 470 END_STATE() 471 472 HTML_BEGIN_STATE(RAWTEXTEndTagNameState) { 473 if (isASCIIUpper(cc)) { 474 m_temporaryBuffer.append(static_cast<LChar>(cc)); 475 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 476 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 477 } else if (isASCIILower(cc)) { 478 m_temporaryBuffer.append(static_cast<LChar>(cc)); 479 addToPossibleEndTag(static_cast<LChar>(cc)); 480 HTML_ADVANCE_TO(RAWTEXTEndTagNameState); 481 } else { 482 if (isTokenizerWhitespace(cc)) { 483 if (isAppropriateEndTag()) { 484 m_temporaryBuffer.append(static_cast<LChar>(cc)); 485 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 486 } 487 } else if (cc == '/') { 488 if (isAppropriateEndTag()) { 489 m_temporaryBuffer.append(static_cast<LChar>(cc)); 490 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 491 } 492 } else if (cc == '>') { 493 if (isAppropriateEndTag()) { 494 m_temporaryBuffer.append(static_cast<LChar>(cc)); 495 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 496 } 497 } 498 bufferCharacter('<'); 499 bufferCharacter('/'); 500 m_token->appendToCharacter(m_temporaryBuffer); 501 m_bufferedEndTagName.clear(); 502 m_temporaryBuffer.clear(); 503 HTML_RECONSUME_IN(RAWTEXTState); 504 } 505 } 506 END_STATE() 507 508 HTML_BEGIN_STATE(ScriptDataLessThanSignState) { 509 if (cc == '/') { 510 m_temporaryBuffer.clear(); 511 ASSERT(m_bufferedEndTagName.isEmpty()); 512 HTML_ADVANCE_TO(ScriptDataEndTagOpenState); 513 } else if (cc == '!') { 514 bufferCharacter('<'); 515 bufferCharacter('!'); 516 HTML_ADVANCE_TO(ScriptDataEscapeStartState); 517 } else { 518 bufferCharacter('<'); 519 HTML_RECONSUME_IN(ScriptDataState); 520 } 521 } 522 END_STATE() 523 524 HTML_BEGIN_STATE(ScriptDataEndTagOpenState) { 525 if (isASCIIUpper(cc)) { 526 m_temporaryBuffer.append(static_cast<LChar>(cc)); 527 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 528 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 529 } else if (isASCIILower(cc)) { 530 m_temporaryBuffer.append(static_cast<LChar>(cc)); 531 addToPossibleEndTag(static_cast<LChar>(cc)); 532 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 533 } else { 534 bufferCharacter('<'); 535 bufferCharacter('/'); 536 HTML_RECONSUME_IN(ScriptDataState); 537 } 538 } 539 END_STATE() 540 541 HTML_BEGIN_STATE(ScriptDataEndTagNameState) { 542 if (isASCIIUpper(cc)) { 543 m_temporaryBuffer.append(static_cast<LChar>(cc)); 544 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 545 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 546 } else if (isASCIILower(cc)) { 547 m_temporaryBuffer.append(static_cast<LChar>(cc)); 548 addToPossibleEndTag(static_cast<LChar>(cc)); 549 HTML_ADVANCE_TO(ScriptDataEndTagNameState); 550 } else { 551 if (isTokenizerWhitespace(cc)) { 552 if (isAppropriateEndTag()) { 553 m_temporaryBuffer.append(static_cast<LChar>(cc)); 554 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 555 } 556 } else if (cc == '/') { 557 if (isAppropriateEndTag()) { 558 m_temporaryBuffer.append(static_cast<LChar>(cc)); 559 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 560 } 561 } else if (cc == '>') { 562 if (isAppropriateEndTag()) { 563 m_temporaryBuffer.append(static_cast<LChar>(cc)); 564 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 565 } 566 } 567 bufferCharacter('<'); 568 bufferCharacter('/'); 569 m_token->appendToCharacter(m_temporaryBuffer); 570 m_bufferedEndTagName.clear(); 571 m_temporaryBuffer.clear(); 572 HTML_RECONSUME_IN(ScriptDataState); 573 } 574 } 575 END_STATE() 576 577 HTML_BEGIN_STATE(ScriptDataEscapeStartState) { 578 if (cc == '-') { 579 bufferCharacter(cc); 580 HTML_ADVANCE_TO(ScriptDataEscapeStartDashState); 581 } else 582 HTML_RECONSUME_IN(ScriptDataState); 583 } 584 END_STATE() 585 586 HTML_BEGIN_STATE(ScriptDataEscapeStartDashState) { 587 if (cc == '-') { 588 bufferCharacter(cc); 589 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 590 } else 591 HTML_RECONSUME_IN(ScriptDataState); 592 } 593 END_STATE() 594 595 HTML_BEGIN_STATE(ScriptDataEscapedState) { 596 if (cc == '-') { 597 bufferCharacter(cc); 598 HTML_ADVANCE_TO(ScriptDataEscapedDashState); 599 } else if (cc == '<') 600 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 601 else if (cc == kEndOfFileMarker) { 602 parseError(); 603 HTML_RECONSUME_IN(DataState); 604 } else { 605 bufferCharacter(cc); 606 HTML_ADVANCE_TO(ScriptDataEscapedState); 607 } 608 } 609 END_STATE() 610 611 HTML_BEGIN_STATE(ScriptDataEscapedDashState) { 612 if (cc == '-') { 613 bufferCharacter(cc); 614 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 615 } else if (cc == '<') 616 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 617 else if (cc == kEndOfFileMarker) { 618 parseError(); 619 HTML_RECONSUME_IN(DataState); 620 } else { 621 bufferCharacter(cc); 622 HTML_ADVANCE_TO(ScriptDataEscapedState); 623 } 624 } 625 END_STATE() 626 627 HTML_BEGIN_STATE(ScriptDataEscapedDashDashState) { 628 if (cc == '-') { 629 bufferCharacter(cc); 630 HTML_ADVANCE_TO(ScriptDataEscapedDashDashState); 631 } else if (cc == '<') 632 HTML_ADVANCE_TO(ScriptDataEscapedLessThanSignState); 633 else if (cc == '>') { 634 bufferCharacter(cc); 635 HTML_ADVANCE_TO(ScriptDataState); 636 } else if (cc == kEndOfFileMarker) { 637 parseError(); 638 HTML_RECONSUME_IN(DataState); 639 } else { 640 bufferCharacter(cc); 641 HTML_ADVANCE_TO(ScriptDataEscapedState); 642 } 643 } 644 END_STATE() 645 646 HTML_BEGIN_STATE(ScriptDataEscapedLessThanSignState) { 647 if (cc == '/') { 648 m_temporaryBuffer.clear(); 649 ASSERT(m_bufferedEndTagName.isEmpty()); 650 HTML_ADVANCE_TO(ScriptDataEscapedEndTagOpenState); 651 } else if (isASCIIUpper(cc)) { 652 bufferCharacter('<'); 653 bufferCharacter(cc); 654 m_temporaryBuffer.clear(); 655 m_temporaryBuffer.append(toLowerCase(cc)); 656 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 657 } else if (isASCIILower(cc)) { 658 bufferCharacter('<'); 659 bufferCharacter(cc); 660 m_temporaryBuffer.clear(); 661 m_temporaryBuffer.append(static_cast<LChar>(cc)); 662 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 663 } else { 664 bufferCharacter('<'); 665 HTML_RECONSUME_IN(ScriptDataEscapedState); 666 } 667 } 668 END_STATE() 669 670 HTML_BEGIN_STATE(ScriptDataEscapedEndTagOpenState) { 671 if (isASCIIUpper(cc)) { 672 m_temporaryBuffer.append(static_cast<LChar>(cc)); 673 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 674 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 675 } else if (isASCIILower(cc)) { 676 m_temporaryBuffer.append(static_cast<LChar>(cc)); 677 addToPossibleEndTag(static_cast<LChar>(cc)); 678 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 679 } else { 680 bufferCharacter('<'); 681 bufferCharacter('/'); 682 HTML_RECONSUME_IN(ScriptDataEscapedState); 683 } 684 } 685 END_STATE() 686 687 HTML_BEGIN_STATE(ScriptDataEscapedEndTagNameState) { 688 if (isASCIIUpper(cc)) { 689 m_temporaryBuffer.append(static_cast<LChar>(cc)); 690 addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc))); 691 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 692 } else if (isASCIILower(cc)) { 693 m_temporaryBuffer.append(static_cast<LChar>(cc)); 694 addToPossibleEndTag(static_cast<LChar>(cc)); 695 HTML_ADVANCE_TO(ScriptDataEscapedEndTagNameState); 696 } else { 697 if (isTokenizerWhitespace(cc)) { 698 if (isAppropriateEndTag()) { 699 m_temporaryBuffer.append(static_cast<LChar>(cc)); 700 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState); 701 } 702 } else if (cc == '/') { 703 if (isAppropriateEndTag()) { 704 m_temporaryBuffer.append(static_cast<LChar>(cc)); 705 FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState); 706 } 707 } else if (cc == '>') { 708 if (isAppropriateEndTag()) { 709 m_temporaryBuffer.append(static_cast<LChar>(cc)); 710 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState); 711 } 712 } 713 bufferCharacter('<'); 714 bufferCharacter('/'); 715 m_token->appendToCharacter(m_temporaryBuffer); 716 m_bufferedEndTagName.clear(); 717 m_temporaryBuffer.clear(); 718 HTML_RECONSUME_IN(ScriptDataEscapedState); 719 } 720 } 721 END_STATE() 722 723 HTML_BEGIN_STATE(ScriptDataDoubleEscapeStartState) { 724 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 725 bufferCharacter(cc); 726 if (temporaryBufferIs(scriptTag.localName())) 727 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 728 else 729 HTML_ADVANCE_TO(ScriptDataEscapedState); 730 } else if (isASCIIUpper(cc)) { 731 bufferCharacter(cc); 732 m_temporaryBuffer.append(toLowerCase(cc)); 733 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 734 } else if (isASCIILower(cc)) { 735 bufferCharacter(cc); 736 m_temporaryBuffer.append(static_cast<LChar>(cc)); 737 HTML_ADVANCE_TO(ScriptDataDoubleEscapeStartState); 738 } else 739 HTML_RECONSUME_IN(ScriptDataEscapedState); 740 } 741 END_STATE() 742 743 HTML_BEGIN_STATE(ScriptDataDoubleEscapedState) { 744 if (cc == '-') { 745 bufferCharacter(cc); 746 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashState); 747 } else if (cc == '<') { 748 bufferCharacter(cc); 749 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 750 } else if (cc == kEndOfFileMarker) { 751 parseError(); 752 HTML_RECONSUME_IN(DataState); 753 } else { 754 bufferCharacter(cc); 755 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 756 } 757 } 758 END_STATE() 759 760 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashState) { 761 if (cc == '-') { 762 bufferCharacter(cc); 763 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 764 } else if (cc == '<') { 765 bufferCharacter(cc); 766 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 767 } else if (cc == kEndOfFileMarker) { 768 parseError(); 769 HTML_RECONSUME_IN(DataState); 770 } else { 771 bufferCharacter(cc); 772 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 773 } 774 } 775 END_STATE() 776 777 HTML_BEGIN_STATE(ScriptDataDoubleEscapedDashDashState) { 778 if (cc == '-') { 779 bufferCharacter(cc); 780 HTML_ADVANCE_TO(ScriptDataDoubleEscapedDashDashState); 781 } else if (cc == '<') { 782 bufferCharacter(cc); 783 HTML_ADVANCE_TO(ScriptDataDoubleEscapedLessThanSignState); 784 } else if (cc == '>') { 785 bufferCharacter(cc); 786 HTML_ADVANCE_TO(ScriptDataState); 787 } else if (cc == kEndOfFileMarker) { 788 parseError(); 789 HTML_RECONSUME_IN(DataState); 790 } else { 791 bufferCharacter(cc); 792 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 793 } 794 } 795 END_STATE() 796 797 HTML_BEGIN_STATE(ScriptDataDoubleEscapedLessThanSignState) { 798 if (cc == '/') { 799 bufferCharacter(cc); 800 m_temporaryBuffer.clear(); 801 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 802 } else 803 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 804 } 805 END_STATE() 806 807 HTML_BEGIN_STATE(ScriptDataDoubleEscapeEndState) { 808 if (isTokenizerWhitespace(cc) || cc == '/' || cc == '>') { 809 bufferCharacter(cc); 810 if (temporaryBufferIs(scriptTag.localName())) 811 HTML_ADVANCE_TO(ScriptDataEscapedState); 812 else 813 HTML_ADVANCE_TO(ScriptDataDoubleEscapedState); 814 } else if (isASCIIUpper(cc)) { 815 bufferCharacter(cc); 816 m_temporaryBuffer.append(toLowerCase(cc)); 817 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 818 } else if (isASCIILower(cc)) { 819 bufferCharacter(cc); 820 m_temporaryBuffer.append(static_cast<LChar>(cc)); 821 HTML_ADVANCE_TO(ScriptDataDoubleEscapeEndState); 822 } else 823 HTML_RECONSUME_IN(ScriptDataDoubleEscapedState); 824 } 825 END_STATE() 826 827 HTML_BEGIN_STATE(BeforeAttributeNameState) { 828 if (isTokenizerWhitespace(cc)) 829 HTML_ADVANCE_TO(BeforeAttributeNameState); 830 else if (cc == '/') 831 HTML_ADVANCE_TO(SelfClosingStartTagState); 832 else if (cc == '>') 833 return emitAndResumeIn(source, HTMLTokenizer::DataState); 834 else if (isASCIIUpper(cc)) { 835 m_token->addNewAttribute(); 836 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 837 m_token->appendToAttributeName(toLowerCase(cc)); 838 HTML_ADVANCE_TO(AttributeNameState); 839 } else if (cc == kEndOfFileMarker) { 840 parseError(); 841 HTML_RECONSUME_IN(DataState); 842 } else { 843 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 844 parseError(); 845 m_token->addNewAttribute(); 846 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 847 m_token->appendToAttributeName(cc); 848 HTML_ADVANCE_TO(AttributeNameState); 849 } 850 } 851 END_STATE() 852 853 HTML_BEGIN_STATE(AttributeNameState) { 854 if (isTokenizerWhitespace(cc)) { 855 m_token->endAttributeName(source.numberOfCharactersConsumed()); 856 HTML_ADVANCE_TO(AfterAttributeNameState); 857 } else if (cc == '/') { 858 m_token->endAttributeName(source.numberOfCharactersConsumed()); 859 HTML_ADVANCE_TO(SelfClosingStartTagState); 860 } else if (cc == '=') { 861 m_token->endAttributeName(source.numberOfCharactersConsumed()); 862 HTML_ADVANCE_TO(BeforeAttributeValueState); 863 } else if (cc == '>') { 864 m_token->endAttributeName(source.numberOfCharactersConsumed()); 865 return emitAndResumeIn(source, HTMLTokenizer::DataState); 866 } else if (isASCIIUpper(cc)) { 867 m_token->appendToAttributeName(toLowerCase(cc)); 868 HTML_ADVANCE_TO(AttributeNameState); 869 } else if (cc == kEndOfFileMarker) { 870 parseError(); 871 m_token->endAttributeName(source.numberOfCharactersConsumed()); 872 HTML_RECONSUME_IN(DataState); 873 } else { 874 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=') 875 parseError(); 876 m_token->appendToAttributeName(cc); 877 HTML_ADVANCE_TO(AttributeNameState); 878 } 879 } 880 END_STATE() 881 882 HTML_BEGIN_STATE(AfterAttributeNameState) { 883 if (isTokenizerWhitespace(cc)) 884 HTML_ADVANCE_TO(AfterAttributeNameState); 885 else if (cc == '/') 886 HTML_ADVANCE_TO(SelfClosingStartTagState); 887 else if (cc == '=') 888 HTML_ADVANCE_TO(BeforeAttributeValueState); 889 else if (cc == '>') 890 return emitAndResumeIn(source, HTMLTokenizer::DataState); 891 else if (isASCIIUpper(cc)) { 892 m_token->addNewAttribute(); 893 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 894 m_token->appendToAttributeName(toLowerCase(cc)); 895 HTML_ADVANCE_TO(AttributeNameState); 896 } else if (cc == kEndOfFileMarker) { 897 parseError(); 898 HTML_RECONSUME_IN(DataState); 899 } else { 900 if (cc == '"' || cc == '\'' || cc == '<') 901 parseError(); 902 m_token->addNewAttribute(); 903 m_token->beginAttributeName(source.numberOfCharactersConsumed()); 904 m_token->appendToAttributeName(cc); 905 HTML_ADVANCE_TO(AttributeNameState); 906 } 907 } 908 END_STATE() 909 910 HTML_BEGIN_STATE(BeforeAttributeValueState) { 911 if (isTokenizerWhitespace(cc)) 912 HTML_ADVANCE_TO(BeforeAttributeValueState); 913 else if (cc == '"') { 914 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 915 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 916 } else if (cc == '&') { 917 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 918 HTML_RECONSUME_IN(AttributeValueUnquotedState); 919 } else if (cc == '\'') { 920 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1); 921 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 922 } else if (cc == '>') { 923 parseError(); 924 return emitAndResumeIn(source, HTMLTokenizer::DataState); 925 } else if (cc == kEndOfFileMarker) { 926 parseError(); 927 HTML_RECONSUME_IN(DataState); 928 } else { 929 if (cc == '<' || cc == '=' || cc == '`') 930 parseError(); 931 m_token->beginAttributeValue(source.numberOfCharactersConsumed()); 932 m_token->appendToAttributeValue(cc); 933 HTML_ADVANCE_TO(AttributeValueUnquotedState); 934 } 935 } 936 END_STATE() 937 938 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 939 if (cc == '"') { 940 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 941 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 942 } else if (cc == '&') { 943 m_additionalAllowedCharacter = '"'; 944 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 945 } else if (cc == kEndOfFileMarker) { 946 parseError(); 947 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 948 HTML_RECONSUME_IN(DataState); 949 } else { 950 m_token->appendToAttributeValue(cc); 951 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 952 } 953 } 954 END_STATE() 955 956 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 957 if (cc == '\'') { 958 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 959 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 960 } else if (cc == '&') { 961 m_additionalAllowedCharacter = '\''; 962 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 963 } else if (cc == kEndOfFileMarker) { 964 parseError(); 965 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 966 HTML_RECONSUME_IN(DataState); 967 } else { 968 m_token->appendToAttributeValue(cc); 969 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 970 } 971 } 972 END_STATE() 973 974 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 975 if (isTokenizerWhitespace(cc)) { 976 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 977 HTML_ADVANCE_TO(BeforeAttributeNameState); 978 } else if (cc == '&') { 979 m_additionalAllowedCharacter = '>'; 980 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 981 } else if (cc == '>') { 982 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 983 return emitAndResumeIn(source, HTMLTokenizer::DataState); 984 } else if (cc == kEndOfFileMarker) { 985 parseError(); 986 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 987 HTML_RECONSUME_IN(DataState); 988 } else { 989 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 990 parseError(); 991 m_token->appendToAttributeValue(cc); 992 HTML_ADVANCE_TO(AttributeValueUnquotedState); 993 } 994 } 995 END_STATE() 996 997 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { 998 bool notEnoughCharacters = false; 999 DecodedHTMLEntity decodedEntity; 1000 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); 1001 if (notEnoughCharacters) 1002 return haveBufferedCharacterToken(); 1003 if (!success) { 1004 ASSERT(decodedEntity.isEmpty()); 1005 m_token->appendToAttributeValue('&'); 1006 } else { 1007 for (unsigned i = 0; i < decodedEntity.length; ++i) 1008 m_token->appendToAttributeValue(decodedEntity.data[i]); 1009 } 1010 // We're supposed to switch back to the attribute value state that 1011 // we were in when we were switched into this state. Rather than 1012 // keeping track of this explictly, we observe that the previous 1013 // state can be determined by m_additionalAllowedCharacter. 1014 if (m_additionalAllowedCharacter == '"') 1015 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); 1016 else if (m_additionalAllowedCharacter == '\'') 1017 HTML_SWITCH_TO(AttributeValueSingleQuotedState); 1018 else if (m_additionalAllowedCharacter == '>') 1019 HTML_SWITCH_TO(AttributeValueUnquotedState); 1020 else 1021 ASSERT_NOT_REACHED(); 1022 } 1023 END_STATE() 1024 1025 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 1026 if (isTokenizerWhitespace(cc)) 1027 HTML_ADVANCE_TO(BeforeAttributeNameState); 1028 else if (cc == '/') 1029 HTML_ADVANCE_TO(SelfClosingStartTagState); 1030 else if (cc == '>') 1031 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1032 else if (cc == kEndOfFileMarker) { 1033 parseError(); 1034 HTML_RECONSUME_IN(DataState); 1035 } else { 1036 parseError(); 1037 HTML_RECONSUME_IN(BeforeAttributeNameState); 1038 } 1039 } 1040 END_STATE() 1041 1042 HTML_BEGIN_STATE(SelfClosingStartTagState) { 1043 if (cc == '>') { 1044 m_token->setSelfClosing(); 1045 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1046 } else if (cc == kEndOfFileMarker) { 1047 parseError(); 1048 HTML_RECONSUME_IN(DataState); 1049 } else { 1050 parseError(); 1051 HTML_RECONSUME_IN(BeforeAttributeNameState); 1052 } 1053 } 1054 END_STATE() 1055 1056 HTML_BEGIN_STATE(BogusCommentState) { 1057 m_token->beginComment(); 1058 HTML_RECONSUME_IN(ContinueBogusCommentState); 1059 } 1060 END_STATE() 1061 1062 HTML_BEGIN_STATE(ContinueBogusCommentState) { 1063 if (cc == '>') 1064 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1065 else if (cc == kEndOfFileMarker) 1066 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1067 else { 1068 m_token->appendToComment(cc); 1069 HTML_ADVANCE_TO(ContinueBogusCommentState); 1070 } 1071 } 1072 END_STATE() 1073 1074 HTML_BEGIN_STATE(MarkupDeclarationOpenState) { 1075 DEFINE_STATIC_LOCAL(String, dashDashString, ("--")); 1076 DEFINE_STATIC_LOCAL(String, doctypeString, ("doctype")); 1077 DEFINE_STATIC_LOCAL(String, cdataString, ("[CDATA[")); 1078 if (cc == '-') { 1079 SegmentedString::LookAheadResult result = source.lookAhead(dashDashString); 1080 if (result == SegmentedString::DidMatch) { 1081 source.advanceAndASSERT('-'); 1082 source.advanceAndASSERT('-'); 1083 m_token->beginComment(); 1084 HTML_SWITCH_TO(CommentStartState); 1085 } else if (result == SegmentedString::NotEnoughCharacters) 1086 return haveBufferedCharacterToken(); 1087 } else if (cc == 'D' || cc == 'd') { 1088 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(doctypeString); 1089 if (result == SegmentedString::DidMatch) { 1090 advanceStringAndASSERTIgnoringCase(source, "doctype"); 1091 HTML_SWITCH_TO(DOCTYPEState); 1092 } else if (result == SegmentedString::NotEnoughCharacters) 1093 return haveBufferedCharacterToken(); 1094 } else if (cc == '[' && shouldAllowCDATA()) { 1095 SegmentedString::LookAheadResult result = source.lookAhead(cdataString); 1096 if (result == SegmentedString::DidMatch) { 1097 advanceStringAndASSERT(source, "[CDATA["); 1098 HTML_SWITCH_TO(CDATASectionState); 1099 } else if (result == SegmentedString::NotEnoughCharacters) 1100 return haveBufferedCharacterToken(); 1101 } 1102 parseError(); 1103 HTML_RECONSUME_IN(BogusCommentState); 1104 } 1105 END_STATE() 1106 1107 HTML_BEGIN_STATE(CommentStartState) { 1108 if (cc == '-') 1109 HTML_ADVANCE_TO(CommentStartDashState); 1110 else if (cc == '>') { 1111 parseError(); 1112 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1113 } else if (cc == kEndOfFileMarker) { 1114 parseError(); 1115 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1116 } else { 1117 m_token->appendToComment(cc); 1118 HTML_ADVANCE_TO(CommentState); 1119 } 1120 } 1121 END_STATE() 1122 1123 HTML_BEGIN_STATE(CommentStartDashState) { 1124 if (cc == '-') 1125 HTML_ADVANCE_TO(CommentEndState); 1126 else if (cc == '>') { 1127 parseError(); 1128 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1129 } else if (cc == kEndOfFileMarker) { 1130 parseError(); 1131 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1132 } else { 1133 m_token->appendToComment('-'); 1134 m_token->appendToComment(cc); 1135 HTML_ADVANCE_TO(CommentState); 1136 } 1137 } 1138 END_STATE() 1139 1140 HTML_BEGIN_STATE(CommentState) { 1141 if (cc == '-') 1142 HTML_ADVANCE_TO(CommentEndDashState); 1143 else if (cc == kEndOfFileMarker) { 1144 parseError(); 1145 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1146 } else { 1147 m_token->appendToComment(cc); 1148 HTML_ADVANCE_TO(CommentState); 1149 } 1150 } 1151 END_STATE() 1152 1153 HTML_BEGIN_STATE(CommentEndDashState) { 1154 if (cc == '-') 1155 HTML_ADVANCE_TO(CommentEndState); 1156 else if (cc == kEndOfFileMarker) { 1157 parseError(); 1158 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1159 } else { 1160 m_token->appendToComment('-'); 1161 m_token->appendToComment(cc); 1162 HTML_ADVANCE_TO(CommentState); 1163 } 1164 } 1165 END_STATE() 1166 1167 HTML_BEGIN_STATE(CommentEndState) { 1168 if (cc == '>') 1169 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1170 else if (cc == '!') { 1171 parseError(); 1172 HTML_ADVANCE_TO(CommentEndBangState); 1173 } else if (cc == '-') { 1174 parseError(); 1175 m_token->appendToComment('-'); 1176 HTML_ADVANCE_TO(CommentEndState); 1177 } else if (cc == kEndOfFileMarker) { 1178 parseError(); 1179 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1180 } else { 1181 parseError(); 1182 m_token->appendToComment('-'); 1183 m_token->appendToComment('-'); 1184 m_token->appendToComment(cc); 1185 HTML_ADVANCE_TO(CommentState); 1186 } 1187 } 1188 END_STATE() 1189 1190 HTML_BEGIN_STATE(CommentEndBangState) { 1191 if (cc == '-') { 1192 m_token->appendToComment('-'); 1193 m_token->appendToComment('-'); 1194 m_token->appendToComment('!'); 1195 HTML_ADVANCE_TO(CommentEndDashState); 1196 } else if (cc == '>') 1197 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1198 else if (cc == kEndOfFileMarker) { 1199 parseError(); 1200 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1201 } else { 1202 m_token->appendToComment('-'); 1203 m_token->appendToComment('-'); 1204 m_token->appendToComment('!'); 1205 m_token->appendToComment(cc); 1206 HTML_ADVANCE_TO(CommentState); 1207 } 1208 } 1209 END_STATE() 1210 1211 HTML_BEGIN_STATE(DOCTYPEState) { 1212 if (isTokenizerWhitespace(cc)) 1213 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1214 else if (cc == kEndOfFileMarker) { 1215 parseError(); 1216 m_token->beginDOCTYPE(); 1217 m_token->setForceQuirks(); 1218 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1219 } else { 1220 parseError(); 1221 HTML_RECONSUME_IN(BeforeDOCTYPENameState); 1222 } 1223 } 1224 END_STATE() 1225 1226 HTML_BEGIN_STATE(BeforeDOCTYPENameState) { 1227 if (isTokenizerWhitespace(cc)) 1228 HTML_ADVANCE_TO(BeforeDOCTYPENameState); 1229 else if (isASCIIUpper(cc)) { 1230 m_token->beginDOCTYPE(toLowerCase(cc)); 1231 HTML_ADVANCE_TO(DOCTYPENameState); 1232 } else if (cc == '>') { 1233 parseError(); 1234 m_token->beginDOCTYPE(); 1235 m_token->setForceQuirks(); 1236 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1237 } else if (cc == kEndOfFileMarker) { 1238 parseError(); 1239 m_token->beginDOCTYPE(); 1240 m_token->setForceQuirks(); 1241 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1242 } else { 1243 m_token->beginDOCTYPE(cc); 1244 HTML_ADVANCE_TO(DOCTYPENameState); 1245 } 1246 } 1247 END_STATE() 1248 1249 HTML_BEGIN_STATE(DOCTYPENameState) { 1250 if (isTokenizerWhitespace(cc)) 1251 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1252 else if (cc == '>') 1253 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1254 else if (isASCIIUpper(cc)) { 1255 m_token->appendToName(toLowerCase(cc)); 1256 HTML_ADVANCE_TO(DOCTYPENameState); 1257 } else if (cc == kEndOfFileMarker) { 1258 parseError(); 1259 m_token->setForceQuirks(); 1260 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1261 } else { 1262 m_token->appendToName(cc); 1263 HTML_ADVANCE_TO(DOCTYPENameState); 1264 } 1265 } 1266 END_STATE() 1267 1268 HTML_BEGIN_STATE(AfterDOCTYPENameState) { 1269 if (isTokenizerWhitespace(cc)) 1270 HTML_ADVANCE_TO(AfterDOCTYPENameState); 1271 if (cc == '>') 1272 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1273 else if (cc == kEndOfFileMarker) { 1274 parseError(); 1275 m_token->setForceQuirks(); 1276 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1277 } else { 1278 DEFINE_STATIC_LOCAL(String, publicString, ("public")); 1279 DEFINE_STATIC_LOCAL(String, systemString, ("system")); 1280 if (cc == 'P' || cc == 'p') { 1281 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(publicString); 1282 if (result == SegmentedString::DidMatch) { 1283 advanceStringAndASSERTIgnoringCase(source, "public"); 1284 HTML_SWITCH_TO(AfterDOCTYPEPublicKeywordState); 1285 } else if (result == SegmentedString::NotEnoughCharacters) 1286 return haveBufferedCharacterToken(); 1287 } else if (cc == 'S' || cc == 's') { 1288 SegmentedString::LookAheadResult result = source.lookAheadIgnoringCase(systemString); 1289 if (result == SegmentedString::DidMatch) { 1290 advanceStringAndASSERTIgnoringCase(source, "system"); 1291 HTML_SWITCH_TO(AfterDOCTYPESystemKeywordState); 1292 } else if (result == SegmentedString::NotEnoughCharacters) 1293 return haveBufferedCharacterToken(); 1294 } 1295 parseError(); 1296 m_token->setForceQuirks(); 1297 HTML_ADVANCE_TO(BogusDOCTYPEState); 1298 } 1299 } 1300 END_STATE() 1301 1302 HTML_BEGIN_STATE(AfterDOCTYPEPublicKeywordState) { 1303 if (isTokenizerWhitespace(cc)) 1304 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1305 else if (cc == '"') { 1306 parseError(); 1307 m_token->setPublicIdentifierToEmptyString(); 1308 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1309 } else if (cc == '\'') { 1310 parseError(); 1311 m_token->setPublicIdentifierToEmptyString(); 1312 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1313 } else if (cc == '>') { 1314 parseError(); 1315 m_token->setForceQuirks(); 1316 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1317 } else if (cc == kEndOfFileMarker) { 1318 parseError(); 1319 m_token->setForceQuirks(); 1320 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1321 } else { 1322 parseError(); 1323 m_token->setForceQuirks(); 1324 HTML_ADVANCE_TO(BogusDOCTYPEState); 1325 } 1326 } 1327 END_STATE() 1328 1329 HTML_BEGIN_STATE(BeforeDOCTYPEPublicIdentifierState) { 1330 if (isTokenizerWhitespace(cc)) 1331 HTML_ADVANCE_TO(BeforeDOCTYPEPublicIdentifierState); 1332 else if (cc == '"') { 1333 m_token->setPublicIdentifierToEmptyString(); 1334 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1335 } else if (cc == '\'') { 1336 m_token->setPublicIdentifierToEmptyString(); 1337 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1338 } else if (cc == '>') { 1339 parseError(); 1340 m_token->setForceQuirks(); 1341 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1342 } else if (cc == kEndOfFileMarker) { 1343 parseError(); 1344 m_token->setForceQuirks(); 1345 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1346 } else { 1347 parseError(); 1348 m_token->setForceQuirks(); 1349 HTML_ADVANCE_TO(BogusDOCTYPEState); 1350 } 1351 } 1352 END_STATE() 1353 1354 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuotedState) { 1355 if (cc == '"') 1356 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1357 else if (cc == '>') { 1358 parseError(); 1359 m_token->setForceQuirks(); 1360 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1361 } else if (cc == kEndOfFileMarker) { 1362 parseError(); 1363 m_token->setForceQuirks(); 1364 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1365 } else { 1366 m_token->appendToPublicIdentifier(cc); 1367 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierDoubleQuotedState); 1368 } 1369 } 1370 END_STATE() 1371 1372 HTML_BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuotedState) { 1373 if (cc == '\'') 1374 HTML_ADVANCE_TO(AfterDOCTYPEPublicIdentifierState); 1375 else if (cc == '>') { 1376 parseError(); 1377 m_token->setForceQuirks(); 1378 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1379 } else if (cc == kEndOfFileMarker) { 1380 parseError(); 1381 m_token->setForceQuirks(); 1382 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1383 } else { 1384 m_token->appendToPublicIdentifier(cc); 1385 HTML_ADVANCE_TO(DOCTYPEPublicIdentifierSingleQuotedState); 1386 } 1387 } 1388 END_STATE() 1389 1390 HTML_BEGIN_STATE(AfterDOCTYPEPublicIdentifierState) { 1391 if (isTokenizerWhitespace(cc)) 1392 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1393 else if (cc == '>') 1394 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1395 else if (cc == '"') { 1396 parseError(); 1397 m_token->setSystemIdentifierToEmptyString(); 1398 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1399 } else if (cc == '\'') { 1400 parseError(); 1401 m_token->setSystemIdentifierToEmptyString(); 1402 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1403 } else if (cc == kEndOfFileMarker) { 1404 parseError(); 1405 m_token->setForceQuirks(); 1406 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1407 } else { 1408 parseError(); 1409 m_token->setForceQuirks(); 1410 HTML_ADVANCE_TO(BogusDOCTYPEState); 1411 } 1412 } 1413 END_STATE() 1414 1415 HTML_BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiersState) { 1416 if (isTokenizerWhitespace(cc)) 1417 HTML_ADVANCE_TO(BetweenDOCTYPEPublicAndSystemIdentifiersState); 1418 else if (cc == '>') 1419 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1420 else if (cc == '"') { 1421 m_token->setSystemIdentifierToEmptyString(); 1422 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1423 } else if (cc == '\'') { 1424 m_token->setSystemIdentifierToEmptyString(); 1425 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1426 } else if (cc == kEndOfFileMarker) { 1427 parseError(); 1428 m_token->setForceQuirks(); 1429 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1430 } else { 1431 parseError(); 1432 m_token->setForceQuirks(); 1433 HTML_ADVANCE_TO(BogusDOCTYPEState); 1434 } 1435 } 1436 END_STATE() 1437 1438 HTML_BEGIN_STATE(AfterDOCTYPESystemKeywordState) { 1439 if (isTokenizerWhitespace(cc)) 1440 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1441 else if (cc == '"') { 1442 parseError(); 1443 m_token->setSystemIdentifierToEmptyString(); 1444 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1445 } else if (cc == '\'') { 1446 parseError(); 1447 m_token->setSystemIdentifierToEmptyString(); 1448 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1449 } else if (cc == '>') { 1450 parseError(); 1451 m_token->setForceQuirks(); 1452 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1453 } else if (cc == kEndOfFileMarker) { 1454 parseError(); 1455 m_token->setForceQuirks(); 1456 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1457 } else { 1458 parseError(); 1459 m_token->setForceQuirks(); 1460 HTML_ADVANCE_TO(BogusDOCTYPEState); 1461 } 1462 } 1463 END_STATE() 1464 1465 HTML_BEGIN_STATE(BeforeDOCTYPESystemIdentifierState) { 1466 if (isTokenizerWhitespace(cc)) 1467 HTML_ADVANCE_TO(BeforeDOCTYPESystemIdentifierState); 1468 if (cc == '"') { 1469 m_token->setSystemIdentifierToEmptyString(); 1470 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1471 } else if (cc == '\'') { 1472 m_token->setSystemIdentifierToEmptyString(); 1473 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1474 } else if (cc == '>') { 1475 parseError(); 1476 m_token->setForceQuirks(); 1477 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1478 } else if (cc == kEndOfFileMarker) { 1479 parseError(); 1480 m_token->setForceQuirks(); 1481 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1482 } else { 1483 parseError(); 1484 m_token->setForceQuirks(); 1485 HTML_ADVANCE_TO(BogusDOCTYPEState); 1486 } 1487 } 1488 END_STATE() 1489 1490 HTML_BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuotedState) { 1491 if (cc == '"') 1492 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1493 else if (cc == '>') { 1494 parseError(); 1495 m_token->setForceQuirks(); 1496 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1497 } else if (cc == kEndOfFileMarker) { 1498 parseError(); 1499 m_token->setForceQuirks(); 1500 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1501 } else { 1502 m_token->appendToSystemIdentifier(cc); 1503 HTML_ADVANCE_TO(DOCTYPESystemIdentifierDoubleQuotedState); 1504 } 1505 } 1506 END_STATE() 1507 1508 HTML_BEGIN_STATE(DOCTYPESystemIdentifierSingleQuotedState) { 1509 if (cc == '\'') 1510 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1511 else if (cc == '>') { 1512 parseError(); 1513 m_token->setForceQuirks(); 1514 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1515 } else if (cc == kEndOfFileMarker) { 1516 parseError(); 1517 m_token->setForceQuirks(); 1518 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1519 } else { 1520 m_token->appendToSystemIdentifier(cc); 1521 HTML_ADVANCE_TO(DOCTYPESystemIdentifierSingleQuotedState); 1522 } 1523 } 1524 END_STATE() 1525 1526 HTML_BEGIN_STATE(AfterDOCTYPESystemIdentifierState) { 1527 if (isTokenizerWhitespace(cc)) 1528 HTML_ADVANCE_TO(AfterDOCTYPESystemIdentifierState); 1529 else if (cc == '>') 1530 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1531 else if (cc == kEndOfFileMarker) { 1532 parseError(); 1533 m_token->setForceQuirks(); 1534 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1535 } else { 1536 parseError(); 1537 HTML_ADVANCE_TO(BogusDOCTYPEState); 1538 } 1539 } 1540 END_STATE() 1541 1542 HTML_BEGIN_STATE(BogusDOCTYPEState) { 1543 if (cc == '>') 1544 return emitAndResumeIn(source, HTMLTokenizer::DataState); 1545 else if (cc == kEndOfFileMarker) 1546 return emitAndReconsumeIn(source, HTMLTokenizer::DataState); 1547 HTML_ADVANCE_TO(BogusDOCTYPEState); 1548 } 1549 END_STATE() 1550 1551 HTML_BEGIN_STATE(CDATASectionState) { 1552 if (cc == ']') 1553 HTML_ADVANCE_TO(CDATASectionRightSquareBracketState); 1554 else if (cc == kEndOfFileMarker) 1555 HTML_RECONSUME_IN(DataState); 1556 else { 1557 bufferCharacter(cc); 1558 HTML_ADVANCE_TO(CDATASectionState); 1559 } 1560 } 1561 END_STATE() 1562 1563 HTML_BEGIN_STATE(CDATASectionRightSquareBracketState) { 1564 if (cc == ']') 1565 HTML_ADVANCE_TO(CDATASectionDoubleRightSquareBracketState); 1566 else { 1567 bufferCharacter(']'); 1568 HTML_RECONSUME_IN(CDATASectionState); 1569 } 1570 } 1571 1572 HTML_BEGIN_STATE(CDATASectionDoubleRightSquareBracketState) { 1573 if (cc == '>') 1574 HTML_ADVANCE_TO(DataState); 1575 else { 1576 bufferCharacter(']'); 1577 bufferCharacter(']'); 1578 HTML_RECONSUME_IN(CDATASectionState); 1579 } 1580 } 1581 END_STATE() 1582 1583 } 1584 1585 ASSERT_NOT_REACHED(); 1586 return false; 1587 } 1588 1589 String HTMLTokenizer::bufferedCharacters() const 1590 { 1591 // FIXME: Add an assert about m_state. 1592 StringBuilder characters; 1593 characters.reserveCapacity(numberOfBufferedCharacters()); 1594 characters.append('<'); 1595 characters.append('/'); 1596 characters.append(m_temporaryBuffer.data(), m_temporaryBuffer.size()); 1597 return characters.toString(); 1598 } 1599 1600 void HTMLTokenizer::updateStateFor(const AtomicString& tagName) 1601 { 1602 if (tagName == textareaTag || tagName == titleTag) 1603 setState(HTMLTokenizer::RCDATAState); 1604 else if (tagName == plaintextTag) 1605 setState(HTMLTokenizer::PLAINTEXTState); 1606 else if (tagName == scriptTag) 1607 setState(HTMLTokenizer::ScriptDataState); 1608 else if (tagName == styleTag 1609 || tagName == iframeTag 1610 || tagName == xmpTag 1611 || (tagName == noembedTag && m_options.pluginsEnabled) 1612 || tagName == noframesTag 1613 || (tagName == noscriptTag && m_options.scriptEnabled)) 1614 setState(HTMLTokenizer::RAWTEXTState); 1615 } 1616 1617 inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString) 1618 { 1619 return vectorEqualsString(m_temporaryBuffer, expectedString); 1620 } 1621 1622 inline void HTMLTokenizer::addToPossibleEndTag(LChar cc) 1623 { 1624 ASSERT(isEndTagBufferingState(m_state)); 1625 m_bufferedEndTagName.append(cc); 1626 } 1627 1628 inline bool HTMLTokenizer::isAppropriateEndTag() 1629 { 1630 if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size()) 1631 return false; 1632 1633 size_t numCharacters = m_bufferedEndTagName.size(); 1634 1635 for (size_t i = 0; i < numCharacters; i++) { 1636 if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i]) 1637 return false; 1638 } 1639 1640 return true; 1641 } 1642 1643 inline void HTMLTokenizer::parseError() 1644 { 1645 notImplemented(); 1646 } 1647 1648 } 1649