1 /* 2 Copyright (C) 1997 Martin Jones (mjones (at) kde.org) 3 (C) 1997 Torben Weis (weis (at) kde.org) 4 (C) 1998 Waldo Bastian (bastian (at) kde.org) 5 (C) 1999 Lars Knoll (knoll (at) kde.org) 6 (C) 1999 Antti Koivisto (koivisto (at) kde.org) 7 (C) 2001 Dirk Mueller (mueller (at) kde.org) 8 Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 9 Copyright (C) 2005, 2006 Alexey Proskuryakov (ap (at) nypop.com) 10 Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) 11 12 This library is free software; you can redistribute it and/or 13 modify it under the terms of the GNU Library General Public 14 License as published by the Free Software Foundation; either 15 version 2 of the License, or (at your option) any later version. 16 17 This library is distributed in the hope that it will be useful, 18 but WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 Library General Public License for more details. 21 22 You should have received a copy of the GNU Library General Public License 23 along with this library; see the file COPYING.LIB. If not, write to 24 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 25 Boston, MA 02110-1301, USA. 
*/

#include "config.h"
#include "HTMLTokenizer.h"

#include "CSSHelper.h"
#include "Cache.h"
#include "CachedScript.h"
#include "DocLoader.h"
#include "DocumentFragment.h"
#include "Event.h"
#include "EventNames.h"
#include "Frame.h"
#include "FrameLoader.h"
#include "FrameView.h"
#include "HTMLElement.h"
#include "HTMLNames.h"
#include "HTMLParser.h"
#include "HTMLScriptElement.h"
#include "HTMLViewSourceDocument.h"
#include "ImageLoader.h"
#include "InspectorTimelineAgent.h"
#include "MappedAttribute.h"
#include "Page.h"
#include "PreloadScanner.h"
#include "ScriptController.h"
#include "ScriptSourceCode.h"
#include "ScriptValue.h"
#include "XSSAuditor.h"
#include <wtf/ASCIICType.h>
#include <wtf/CurrentTime.h>

#include "HTMLEntityNames.c"

#ifdef ANDROID_INSTRUMENT
#include "TimeCounter.h"
#endif

#define PRELOAD_SCANNER_ENABLED 1
// #define INSTRUMENT_LAYOUT_SCHEDULING 1

using namespace WTF;
using namespace std;

namespace WebCore {

using namespace HTMLNames;

#if MOBILE
// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.
// This value is used to define how many characters the tokenizer will process before
// yielding control.
static const int defaultTokenizerChunkSize = 256;
#else
static const int defaultTokenizerChunkSize = 4096;
#endif

// Default yield delay, in seconds (0.500 corresponds to the "500" mentioned below).
// HTMLTokenizer::begin() uses these defaults unless the Page supplies custom values.
#if MOBILE
// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
// it will take way too long to load a page.
static const double defaultTokenizerTimeDelay = 0.300;
#else
// FIXME: We would like this constant to be 200ms.
// Yielding more aggressively results in increased responsiveness and better incremental rendering.
// It slows down overall page-load on slower machines, though, so for now we set a value of 500.
static const double defaultTokenizerTimeDelay = 0.500;
#endif

// Lowercase markup keywords and raw-text end-tag prefixes.
// publicStart and systemStart are compared against toASCIILower()ed input in parseDoctype().
static const char commentStart [] = "<!--";
static const char doctypeStart [] = "<!doctype";
static const char publicStart [] = "public";
static const char systemStart [] = "system";
static const char scriptEnd [] = "</script";
static const char xmpEnd [] = "</xmp";
static const char styleEnd [] = "</style";
static const char textareaEnd [] = "</textarea";
static const char titleEnd [] = "</title";
static const char iframeEnd [] = "</iframe";

// Full support for MS Windows extensions to Latin-1.
// Technically these extensions should only be activated for pages
// marked "windows-1252" or "cp1252", but
// in the standard Microsoft way, these extensions infect hundreds of thousands
// of web pages. Note that people with non-latin-1 Microsoft extensions
// are SOL.
//
// See: http://www.microsoft.com/globaldev/reference/WinCP.asp
//      http://www.bbsinc.com/iso8859.html
//      http://www.obviously.com/
//
// There may be better equivalents

// We only need this for entities. For non-entity text, we handle this in the text encoding.
119 120 static const UChar windowsLatin1ExtensionArray[32] = { 121 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 122 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 123 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 124 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F 125 }; 126 127 static inline UChar fixUpChar(UChar c) 128 { 129 if ((c & ~0x1F) != 0x0080) 130 return c; 131 return windowsLatin1ExtensionArray[c - 0x80]; 132 } 133 134 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length) 135 { 136 for (unsigned i = 0; i != length; ++i) { 137 unsigned char c1 = s1[i]; 138 unsigned char uc1 = toASCIIUpper(static_cast<char>(c1)); 139 UChar c2 = s2[i]; 140 if (c1 != c2 && uc1 != c2) 141 return false; 142 } 143 return true; 144 } 145 146 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode) 147 { 148 if (!attrName.isEmpty()) { 149 ASSERT(!attrName.contains('/')); 150 RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue); 151 if (!attrs) { 152 attrs = NamedMappedAttrMap::create(); 153 attrs->reserveInitialCapacity(10); 154 } 155 attrs->insertAttribute(a.release(), viewSourceMode); 156 } 157 158 attrName = emptyAtom; 159 } 160 161 // ---------------------------------------------------------------------------- 162 163 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors) 164 : Tokenizer() 165 , m_buffer(0) 166 , m_scriptCode(0) 167 , m_scriptCodeSize(0) 168 , m_scriptCodeCapacity(0) 169 , m_scriptCodeResync(0) 170 , m_executingScript(0) 171 , m_requestingScript(false) 172 , m_hasScriptsWaitingForStylesheets(false) 173 , m_timer(this, &HTMLTokenizer::timerFired) 174 , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired) 175 , m_doc(doc) 176 , m_parser(new HTMLParser(doc, reportErrors)) 177 , m_inWrite(false) 178 , m_fragment(false) 
179 , m_scriptingPermission(FragmentScriptingAllowed) 180 { 181 begin(); 182 } 183 184 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc) 185 : Tokenizer(true) 186 , m_buffer(0) 187 , m_scriptCode(0) 188 , m_scriptCodeSize(0) 189 , m_scriptCodeCapacity(0) 190 , m_scriptCodeResync(0) 191 , m_executingScript(0) 192 , m_requestingScript(false) 193 , m_hasScriptsWaitingForStylesheets(false) 194 , m_timer(this, &HTMLTokenizer::timerFired) 195 , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired) 196 , m_doc(doc) 197 , m_parser(0) 198 , m_inWrite(false) 199 , m_fragment(false) 200 , m_scriptingPermission(FragmentScriptingAllowed) 201 { 202 begin(); 203 } 204 205 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag, FragmentScriptingPermission scriptingPermission) 206 : m_buffer(0) 207 , m_scriptCode(0) 208 , m_scriptCodeSize(0) 209 , m_scriptCodeCapacity(0) 210 , m_scriptCodeResync(0) 211 , m_executingScript(0) 212 , m_requestingScript(false) 213 , m_hasScriptsWaitingForStylesheets(false) 214 , m_timer(this, &HTMLTokenizer::timerFired) 215 , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired) 216 , m_doc(frag->document()) 217 , m_parser(new HTMLParser(frag, scriptingPermission)) 218 , m_inWrite(false) 219 , m_fragment(true) 220 , m_scriptingPermission(scriptingPermission) 221 { 222 begin(); 223 } 224 225 void HTMLTokenizer::reset() 226 { 227 ASSERT(m_executingScript == 0); 228 229 while (!m_pendingScripts.isEmpty()) { 230 CachedScript* cs = m_pendingScripts.first().get(); 231 m_pendingScripts.removeFirst(); 232 ASSERT(cache()->disabled() || cs->accessCount() > 0); 233 cs->removeClient(this); 234 } 235 236 fastFree(m_buffer); 237 m_buffer = m_dest = 0; 238 m_bufferSize = 0; 239 240 fastFree(m_scriptCode); 241 m_scriptCode = 0; 242 m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0; 243 244 m_timer.stop(); 245 m_externalScriptsTimer.stop(); 246 247 m_state.setAllowYield(false); 248 
m_state.setForceSynchronous(false); 249 250 m_currentToken.reset(); 251 m_doctypeToken.reset(); 252 m_doctypeSearchCount = 0; 253 m_doctypeSecondarySearchCount = 0; 254 m_hasScriptsWaitingForStylesheets = false; 255 } 256 257 void HTMLTokenizer::begin() 258 { 259 m_executingScript = 0; 260 m_requestingScript = false; 261 m_hasScriptsWaitingForStylesheets = false; 262 m_state.setLoadingExtScript(false); 263 reset(); 264 m_bufferSize = 254; 265 m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254)); 266 m_dest = m_buffer; 267 tquote = NoQuote; 268 searchCount = 0; 269 m_state.setEntityState(NoEntity); 270 m_scriptTagSrcAttrValue = String(); 271 m_pendingSrc.clear(); 272 m_currentPrependingSrc = 0; 273 m_noMoreData = false; 274 m_brokenComments = false; 275 m_brokenServer = false; 276 m_lineNumber = 0; 277 m_currentScriptTagStartLineNumber = 0; 278 m_currentTagStartLineNumber = 0; 279 m_state.setForceSynchronous(false); 280 281 Page* page = m_doc->page(); 282 if (page && page->hasCustomHTMLTokenizerTimeDelay()) 283 m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay(); 284 else 285 m_tokenizerTimeDelay = defaultTokenizerTimeDelay; 286 287 if (page && page->hasCustomHTMLTokenizerChunkSize()) 288 m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize(); 289 else 290 m_tokenizerChunkSize = defaultTokenizerChunkSize; 291 } 292 293 void HTMLTokenizer::setForceSynchronous(bool force) 294 { 295 m_state.setForceSynchronous(force); 296 } 297 298 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state) 299 { 300 // This function adds the listing 'list' as 301 // preformatted text-tokens to the token-collection 302 while (!list.isEmpty()) { 303 if (state.skipLF()) { 304 state.setSkipLF(false); 305 if (*list == '\n') { 306 list.advance(); 307 continue; 308 } 309 } 310 311 checkBuffer(); 312 313 if (*list == '\n' || *list == '\r') { 314 if (state.discardLF()) 315 // Ignore this LF 316 state.setDiscardLF(false); // We have discarded 1 
LF 317 else 318 *m_dest++ = '\n'; 319 320 /* Check for MS-DOS CRLF sequence */ 321 if (*list == '\r') 322 state.setSkipLF(true); 323 324 list.advance(); 325 } else { 326 state.setDiscardLF(false); 327 *m_dest++ = *list; 328 list.advance(); 329 } 330 } 331 332 return state; 333 } 334 335 HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state) 336 { 337 ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState()); 338 ASSERT(!state.hasTagState()); 339 ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1); 340 if (state.inScript() && !m_currentScriptTagStartLineNumber) 341 m_currentScriptTagStartLineNumber = m_lineNumber; 342 343 if (state.inComment()) 344 state = parseComment(src, state); 345 346 int lastDecodedEntityPosition = -1; 347 while (!src.isEmpty()) { 348 checkScriptBuffer(); 349 UChar ch = *src; 350 351 if (!m_scriptCodeResync && !m_brokenComments && 352 !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() && 353 m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' 
&& m_scriptCode[m_scriptCodeSize - 1] == '-' && 354 (lastDecodedEntityPosition < m_scriptCodeSize - 3)) { 355 state.setInComment(true); 356 state = parseComment(src, state); 357 continue; 358 } 359 if (m_scriptCodeResync && !tquote && ch == '>') { 360 src.advancePastNonNewline(); 361 m_scriptCodeSize = m_scriptCodeResync - 1; 362 m_scriptCodeResync = 0; 363 m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0; 364 if (state.inScript()) 365 state = scriptHandler(state); 366 else { 367 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state); 368 processToken(); 369 if (state.inStyle()) { 370 m_currentToken.tagName = styleTag.localName(); 371 m_currentToken.beginTag = false; 372 } else if (state.inTextArea()) { 373 m_currentToken.tagName = textareaTag.localName(); 374 m_currentToken.beginTag = false; 375 } else if (state.inTitle()) { 376 m_currentToken.tagName = titleTag.localName(); 377 m_currentToken.beginTag = false; 378 } else if (state.inXmp()) { 379 m_currentToken.tagName = xmpTag.localName(); 380 m_currentToken.beginTag = false; 381 } else if (state.inIFrame()) { 382 m_currentToken.tagName = iframeTag.localName(); 383 m_currentToken.beginTag = false; 384 } 385 processToken(); 386 state.setInStyle(false); 387 state.setInScript(false); 388 state.setInTextArea(false); 389 state.setInTitle(false); 390 state.setInXmp(false); 391 state.setInIFrame(false); 392 tquote = NoQuote; 393 m_scriptCodeSize = m_scriptCodeResync = 0; 394 } 395 return state; 396 } 397 // possible end of tagname, lets check. 
// (HTMLTokenizer::parseNonHTMLText(), main loop continued)
        // The buffer ends with the end-tag text in m_searchStopper and the next character
        // terminates a tag name: remember where the end tag starts and keep scanning for '>'.
        if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
            m_scriptCodeSize >= m_searchStopperLength &&
            tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
            (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
            m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
            tquote = NoQuote;
            continue;
        }
        // Inside the end tag: track quoting so that a quoted '>' does not close it.
        // A line break ends any open quote.
        if (m_scriptCodeResync && !state.escaped()) {
            if (ch == '\"')
                tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
            else if (ch == '\'')
                tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
            else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
                tquote = NoQuote;
        }
        // A backslash escapes the next character unless it was itself escaped.
        state.setEscaped(!state.escaped() && ch == '\\');
        // Entities are decoded only in <textarea>, <title> and <iframe> content.
        if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
            UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
            src.advancePastNonNewline();
            state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
            // If parseEntity() wrote nothing through scriptCodeDest, record the position so the
            // "<!--" and end-tag checks above ignore text at or before it; otherwise adopt
            // the new end of the buffer.
            if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
                lastDecodedEntityPosition = m_scriptCodeSize;
            else
                m_scriptCodeSize = scriptCodeDest - m_scriptCode;
        } else {
            m_scriptCode[m_scriptCodeSize++] = ch;
            src.advance(m_lineNumber);
        }
    }

    return state;
}

// Called when the closing tag of a <script> element has been seen. Either requests the
// external script named by the src attribute or executes the inline text, emits the
// script text and closing-tag tokens, and re-queues the remaining input so that it is
// parsed after the script has run. Returns the updated tokenizer state.
HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
{
    // We are inside a <script>
    bool doScriptExec = false;
    int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenizer line numbers are 0 based

    // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
    m_currentScriptTagStartLineNumber = 0;

    // (Bugzilla 3837) Scripts following a frameset element should not execute or,
    // in the case of extern scripts, even load.
    bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));

    CachedScript* cs = 0;
    // don't load external scripts for standalone documents (for now)
    if (!inViewSourceMode()) {
        if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
            // forget what we just got; load from src url instead
            if (!m_parser->skipMode() && !followingFrameset) {
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
                if (!m_doc->ownerElement())
                    printf("Requesting script at time %d\n", m_doc->elapsedTime());
#endif
                // The parser might have been stopped by for example a window.close call in an earlier script.
                // If so, we don't want to load scripts.
                if (!m_parserStopped && m_scriptNode->dispatchBeforeLoadEvent(m_scriptTagSrcAttrValue) &&
                    (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
                    m_pendingScripts.append(cs);
                else
                    m_scriptNode = 0;
            } else
                m_scriptNode = 0;
            m_scriptTagSrcAttrValue = String();
        } else {
            // Parse m_scriptCode containing <script> info
            doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
#if ENABLE(XHTMLMP)
            if (!doScriptExec)
                m_doc->setShouldProcessNoscriptElement(true);
#endif
            m_scriptNode = 0;
        }
    }

    // Emit the script text as a token; |node| is whatever processToken() returned for it.
    state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
    RefPtr<Node> node = processToken();

    // When fragment scripting is not allowed, remove the node again so its text is never used.
    if (node && m_scriptingPermission == FragmentScriptingNotAllowed) {
        ExceptionCode ec;
        node->remove(ec);
        node = 0;
    }

    String scriptString = node ? node->textContent() : "";
    // Emit the closing </script> token.
    m_currentToken.tagName = scriptTag.localName();
    m_currentToken.beginTag = false;
    processToken();

    state.setInScript(false);
    m_scriptCodeSize = m_scriptCodeResync = 0;

    // FIXME: The script should be syntax highlighted.
    if (inViewSourceMode())
        return state;

    // Route input appended while the script runs into a local buffer
    // (the previous m_currentPrependingSrc target is restored before returning).
    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
    m_currentPrependingSrc = &prependingSrc;

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif

    if (!m_parser->skipMode() && !followingFrameset) {
        if (cs) {
            // External script: park the unparsed remainder of the input until it has loaded.
            if (savedPrependingSrc)
                savedPrependingSrc->append(m_src);
            else
                m_pendingSrc.prepend(m_src);
            setSrc(SegmentedString());

            // the addClient() call below may call notifyFinished if the script is already in cache,
            // and that mucks with the state directly, so we must write it back to the object.
            m_state = state;
            bool savedRequestingScript = m_requestingScript;
            m_requestingScript = true;
            cs->addClient(this);
            m_requestingScript = savedRequestingScript;
            state = m_state;
            // m_pendingScripts will be empty if the script was already loaded and addClient() executed it
            if (!m_pendingScripts.isEmpty())
                state.setLoadingExtScript(true);
        } else if (!m_fragment && doScriptExec) {
            // Inline script: set the remaining input aside, then run the script text now.
            if (!m_executingScript)
                m_pendingSrc.prepend(m_src);
            else
                prependingSrc = m_src;
            setSrc(SegmentedString());
            state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
        }
    }

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

    // Nothing is blocking the parser: hand the parked input back to m_src.
    if (!m_executingScript && !state.loadingExtScript()) {
        m_src.append(m_pendingSrc);
        m_pendingSrc.clear();
    } else if (!prependingSrc.isEmpty()) {
        // restore first so that the write appends in the right place
        // (does not hurt to do it again below)
        m_currentPrependingSrc = savedPrependingSrc;

        // we need to do this slightly modified bit of one of the write() cases
        // because we want to prepend to m_pendingSrc rather than appending
        // if there's no previous prependingSrc
        if (!m_pendingScripts.isEmpty()) {
            if (m_currentPrependingSrc)
                m_currentPrependingSrc->append(prependingSrc);
            else
                m_pendingSrc.prepend(prependingSrc);
        } else {
            m_state = state;
            write(prependingSrc, false);
            state = m_state;
        }
    }

#if PRELOAD_SCANNER_ENABLED
    // While blocked on an external script, let the preload scanner look through the parked input.
    if (!m_pendingScripts.isEmpty() && !m_executingScript) {
        if (!m_preloadScanner)
            m_preloadScanner.set(new PreloadScanner(m_doc));
        if (!m_preloadScanner->inProgress()) {
            m_preloadScanner->begin();
            m_preloadScanner->write(m_pendingSrc);
        }
    }
#endif
    m_currentPrependingSrc = savedPrependingSrc;

    return state;
}

// Executes |sourceCode| in the document's frame and re-queues any input that was
// appended while it ran. Does nothing for fragments or documents without a frame.
HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
{
    if (m_fragment || !m_doc->frame())
        return state;
    // Nesting counter; decremented again once the script has run.
    m_executingScript++;

    SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    SegmentedString prependingSrc;
    m_currentPrependingSrc = &prependingSrc;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("beginning script execution at %d\n", m_doc->elapsedTime());
#endif

    // Publish the local state to m_state for the duration of the call and read it
    // back afterwards, since the call may change m_state.
    m_state = state;
    m_doc->frame()->script()->executeScript(sourceCode);
    state = m_state;

state.setAllowYield(true); 596 597 #ifdef INSTRUMENT_LAYOUT_SCHEDULING 598 if (!m_doc->ownerElement()) 599 printf("ending script execution at %d\n", m_doc->elapsedTime()); 600 #endif 601 602 m_executingScript--; 603 604 if (!m_executingScript && !state.loadingExtScript()) { 605 m_pendingSrc.prepend(prependingSrc); 606 m_src.append(m_pendingSrc); 607 m_pendingSrc.clear(); 608 } else if (!prependingSrc.isEmpty()) { 609 // restore first so that the write appends in the right place 610 // (does not hurt to do it again below) 611 m_currentPrependingSrc = savedPrependingSrc; 612 613 // we need to do this slightly modified bit of one of the write() cases 614 // because we want to prepend to m_pendingSrc rather than appending 615 // if there's no previous prependingSrc 616 if (!m_pendingScripts.isEmpty()) { 617 if (m_currentPrependingSrc) 618 m_currentPrependingSrc->append(prependingSrc); 619 else 620 m_pendingSrc.prepend(prependingSrc); 621 622 #if PRELOAD_SCANNER_ENABLED 623 // We are stuck waiting for another script. Lets check the source that 624 // was just document.write()n for anything to load. 625 PreloadScanner documentWritePreloadScanner(m_doc); 626 documentWritePreloadScanner.begin(); 627 documentWritePreloadScanner.write(prependingSrc); 628 documentWritePreloadScanner.end(); 629 #endif 630 } else { 631 m_state = state; 632 write(prependingSrc, false); 633 state = m_state; 634 } 635 } 636 637 m_currentPrependingSrc = savedPrependingSrc; 638 639 return state; 640 } 641 642 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state) 643 { 644 // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus. 
645 checkScriptBuffer(src.length()); 646 while (!src.isEmpty()) { 647 UChar ch = *src; 648 m_scriptCode[m_scriptCodeSize++] = ch; 649 if (ch == '>') { 650 bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle()); 651 int endCharsCount = 1; // start off with one for the '>' character 652 if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') { 653 endCharsCount = 3; 654 } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' && 655 m_scriptCode[m_scriptCodeSize-2] == '!') { 656 // Other browsers will accept --!> as a close comment, even though it's 657 // not technically valid. 658 endCharsCount = 4; 659 } 660 if (handleBrokenComments || endCharsCount > 1) { 661 src.advancePastNonNewline(); 662 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) { 663 checkScriptBuffer(); 664 m_scriptCode[m_scriptCodeSize] = 0; 665 m_scriptCode[m_scriptCodeSize + 1] = 0; 666 m_currentToken.tagName = commentAtom; 667 m_currentToken.beginTag = true; 668 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state); 669 processToken(); 670 m_currentToken.tagName = commentAtom; 671 m_currentToken.beginTag = false; 672 processToken(); 673 m_scriptCodeSize = 0; 674 } 675 state.setInComment(false); 676 return state; // Finished parsing comment 677 } 678 } 679 src.advance(m_lineNumber); 680 } 681 682 return state; 683 } 684 685 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state) 686 { 687 checkScriptBuffer(src.length()); 688 while (!src.isEmpty()) { 689 UChar ch = *src; 690 m_scriptCode[m_scriptCodeSize++] = ch; 691 if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') { 692 src.advancePastNonNewline(); 693 state.setInServer(false); 694 m_scriptCodeSize = 0; 695 return state; // 
Finished parsing server include 696 } 697 src.advance(m_lineNumber); 698 } 699 return state; 700 } 701 702 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state) 703 { 704 UChar oldchar = 0; 705 while (!src.isEmpty()) { 706 UChar chbegin = *src; 707 if (chbegin == '\'') 708 tquote = tquote == SingleQuote ? NoQuote : SingleQuote; 709 else if (chbegin == '\"') 710 tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote; 711 // Look for '?>' 712 // Some crappy sites omit the "?" before it, so 713 // we look for an unquoted '>' instead. (IE compatible) 714 else if (chbegin == '>' && (!tquote || oldchar == '?')) { 715 // We got a '?>' sequence 716 state.setInProcessingInstruction(false); 717 src.advancePastNonNewline(); 718 state.setDiscardLF(true); 719 return state; // Finished parsing comment! 720 } 721 src.advance(m_lineNumber); 722 oldchar = chbegin; 723 } 724 725 return state; 726 } 727 728 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state) 729 { 730 while (!src.isEmpty()) { 731 UChar cc = *src; 732 733 if (state.skipLF()) { 734 state.setSkipLF(false); 735 if (cc == '\n') { 736 src.advancePastNewline(m_lineNumber); 737 continue; 738 } 739 } 740 741 // do we need to enlarge the buffer? 
// (HTMLTokenizer::parseText(), loop body continued: make room, then copy one character.)
        checkBuffer();

        if (cc == '\r') {
            state.setSkipLF(true);
            *m_dest++ = '\n';
        } else
            *m_dest++ = cc;
        src.advance(m_lineNumber);
    }

    return state;
}


// Incrementally parses a character reference; the leading '&' has already been
// consumed by the caller.
//   src        - input; consumed as the reference is recognized.
//   dest       - output cursor, advanced past anything written through it.
//   state      - tokenizer state; its entityState() drives the switch below.
//   cBufferPos - number of characters of the reference collected in m_cBuffer so far.
//   start      - true on the first call for a reference: resets cBufferPos,
//                EntityUnicodeValue and the entity state.
//   parsingTag - true enables the "IE compatible" rule in the EntityName case.
// Returns the updated state. If the input runs out mid-reference the entity state
// is left as it is so a later call with start == false can continue.
HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
{
    if (start) {
        cBufferPos = 0;
        state.setEntityState(SearchEntity);
        EntityUnicodeValue = 0;
    }

    while (!src.isEmpty()) {
        UChar cc = *src;
        switch (state.entityState()) {
        case NoEntity:
            // Not expected to be reached; the assertion fires in debug builds.
            ASSERT(state.entityState() != NoEntity);
            return state;

        case SearchEntity:
            // '#' introduces a numeric reference; anything else is tried as a named one.
            if (cc == '#') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(NumericSearch);
            } else
                state.setEntityState(EntityName);
            break;

        case NumericSearch:
            // "&#x" / "&#X" is hexadecimal, "&#" followed by a digit is decimal.
            if (cc == 'x' || cc == 'X') {
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
                state.setEntityState(Hexadecimal);
            } else if (cc >= '0' && cc <= '9')
                state.setEntityState(Decimal);
            else
                state.setEntityState(SearchSemicolon);
            break;

        case Hexadecimal: {
            // Collect hex digits until m_cBuffer holds 10 characters in total.
            int ll = min(src.length(), 10 - cBufferPos);
            while (ll--) {
                cc = *src;
                if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }
                int digit;
                if (cc < 'A')
                    digit = cc - '0';
                else
                    digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
                EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 10)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case Decimal:
        {
            // Collect decimal digits until m_cBuffer holds 9 characters in total.
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;

                if (!(cc >= '0' && cc <= '9')) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            break;
        }
        case EntityName:
        {
            // Collect up to 9 ASCII letters or digits of the entity name.
            int ll = min(src.length(), 9-cBufferPos);
            while (ll--) {
                cc = *src;

                if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
                    state.setEntityState(SearchSemicolon);
                    break;
                }

                m_cBuffer[cBufferPos++] = cc;
                src.advancePastNonNewline();
            }
            if (cBufferPos == 9)
                state.setEntityState(SearchSemicolon);
            if (state.entityState() == SearchSemicolon) {
                if (cBufferPos > 1) {
                    // Since the maximum length of entity name is 9,
                    // so a single char array which is allocated on
                    // the stack, its length is 10, should be OK.
                    // Also if we have an illegal character, we treat it
                    // as illegal entity name.
                    unsigned testedEntityNameLen = 0;
                    char tmpEntityNameBuffer[10];

                    ASSERT(cBufferPos < 10);
                    for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
                        if (m_cBuffer[testedEntityNameLen] > 0x7e)
                            break;
                        tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
                    }

                    const Entity *e;

                    // Look the name up only if every collected character passed the check above.
                    if (testedEntityNameLen == cBufferPos)
                        e = findEntity(tmpEntityNameBuffer, cBufferPos);
                    else
                        e = 0;

                    if (e)
                        EntityUnicodeValue = e->code;

                    // be IE compatible
                    // NOTE(review): src can be empty here when the name consumed the rest of the
                    // input and hit the 9-character limit; what *src yields on an empty
                    // SegmentedString is not visible in this file -- confirm.
                    if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
                        EntityUnicodeValue = 0;
                }
            }
            else
                break;
            // Deliberate fall through: the name is complete, so finish the reference below.
        }
        case SearchSemicolon:
            // Don't allow values that are more than 21 bits.
            if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
                if (!inViewSourceMode()) {
                    // The terminating ';' is optional.
                    if (*src == ';')
                        src.advancePastNonNewline();
                    // The decoded character is pushed onto src rather than written through dest.
                    // NOTE(review): presumably push() re-queues it at the front of src so the
                    // caller consumes it as ordinary input -- confirm against SegmentedString.
                    if (EntityUnicodeValue <= 0xFFFF) {
                        checkBuffer();
                        src.push(fixUpChar(EntityUnicodeValue));
                    } else {
                        // Convert to UTF-16, using surrogate code points.
                        checkBuffer(2);
                        src.push(U16_LEAD(EntityUnicodeValue));
                        src.push(U16_TRAIL(EntityUnicodeValue));
                    }
                } else {
                    // FIXME: We should eventually colorize entities by sending them as a special token.
                    // 12 bytes required: up to 10 bytes in m_cBuffer plus the
                    // leading '&' and trailing ';'
                    // NOTE(review): checkBuffer() is also what guards writes through m_dest elsewhere
                    // in this file, while parseNonHTMLText() passes a dest that points into
                    // m_scriptCode -- confirm the capacity check covers that buffer as well.
                    checkBuffer(12);
                    *dest++ = '&';
                    for (unsigned i = 0; i < cBufferPos; i++)
                        dest[i] = m_cBuffer[i];
                    dest += cBufferPos;
                    if (*src == ';') {
                        *dest++ = ';';
                        src.advancePastNonNewline();
                    }
                }
            } else {
                // 11 bytes required: up to 10 bytes in m_cBuffer plus the
                // leading '&'
                checkBuffer(11);
                // ignore the sequence, add it to the buffer as plaintext
                *dest++ = '&';
                for (unsigned i = 0; i < cBufferPos; i++)
                    dest[i] = m_cBuffer[i];
                dest += cBufferPos;
            }

            state.setEntityState(NoEntity);
            return state;
        }
    }

    return state;
}

HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
{
    ASSERT(state.inDoctype());
    while (!src.isEmpty() && state.inDoctype()) {
        UChar c = *src;
        bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
        switch (m_doctypeToken.state()) {
        case DoctypeBegin: {
            m_doctypeToken.setState(DoctypeBeforeName);
            if (isWhitespace) {
                src.advance(m_lineNumber);
                if (inViewSourceMode())
                    m_doctypeToken.m_source.append(c);
            }
            break;
        }
        case DoctypeBeforeName: {
            if (c == '>') {
                // Malformed. Just exit.
949 src.advancePastNonNewline(); 950 state.setInDoctype(false); 951 if (inViewSourceMode()) 952 processDoctypeToken(); 953 } else if (isWhitespace) { 954 src.advance(m_lineNumber); 955 if (inViewSourceMode()) 956 m_doctypeToken.m_source.append(c); 957 } else 958 m_doctypeToken.setState(DoctypeName); 959 break; 960 } 961 case DoctypeName: { 962 if (c == '>') { 963 // Valid doctype. Emit it. 964 src.advancePastNonNewline(); 965 state.setInDoctype(false); 966 processDoctypeToken(); 967 } else if (isWhitespace) { 968 m_doctypeSearchCount = 0; // Used now to scan for PUBLIC 969 m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM 970 m_doctypeToken.setState(DoctypeAfterName); 971 src.advance(m_lineNumber); 972 if (inViewSourceMode()) 973 m_doctypeToken.m_source.append(c); 974 } else { 975 src.advancePastNonNewline(); 976 m_doctypeToken.m_name.append(c); 977 if (inViewSourceMode()) 978 m_doctypeToken.m_source.append(c); 979 } 980 break; 981 } 982 case DoctypeAfterName: { 983 if (c == '>') { 984 // Valid doctype. Emit it. 
985 src.advancePastNonNewline(); 986 state.setInDoctype(false); 987 processDoctypeToken(); 988 } else if (!isWhitespace) { 989 src.advancePastNonNewline(); 990 if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) { 991 m_doctypeSearchCount++; 992 if (m_doctypeSearchCount == 6) 993 // Found 'PUBLIC' sequence 994 m_doctypeToken.setState(DoctypeBeforePublicID); 995 } else if (m_doctypeSearchCount > 0) { 996 m_doctypeSearchCount = 0; 997 m_doctypeToken.setState(DoctypeBogus); 998 } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) { 999 m_doctypeSecondarySearchCount++; 1000 if (m_doctypeSecondarySearchCount == 6) 1001 // Found 'SYSTEM' sequence 1002 m_doctypeToken.setState(DoctypeBeforeSystemID); 1003 } else { 1004 m_doctypeSecondarySearchCount = 0; 1005 m_doctypeToken.setState(DoctypeBogus); 1006 } 1007 if (inViewSourceMode()) 1008 m_doctypeToken.m_source.append(c); 1009 } else { 1010 src.advance(m_lineNumber); // Whitespace keeps us in the after name state. 1011 if (inViewSourceMode()) 1012 m_doctypeToken.m_source.append(c); 1013 } 1014 break; 1015 } 1016 case DoctypeBeforePublicID: { 1017 if (c == '\"' || c == '\'') { 1018 tquote = c == '\"' ? DoubleQuote : SingleQuote; 1019 m_doctypeToken.setState(DoctypePublicID); 1020 src.advancePastNonNewline(); 1021 if (inViewSourceMode()) 1022 m_doctypeToken.m_source.append(c); 1023 } else if (c == '>') { 1024 // Considered bogus. Don't process the doctype. 
1025 src.advancePastNonNewline(); 1026 state.setInDoctype(false); 1027 if (inViewSourceMode()) 1028 processDoctypeToken(); 1029 } else if (isWhitespace) { 1030 src.advance(m_lineNumber); 1031 if (inViewSourceMode()) 1032 m_doctypeToken.m_source.append(c); 1033 } else 1034 m_doctypeToken.setState(DoctypeBogus); 1035 break; 1036 } 1037 case DoctypePublicID: { 1038 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { 1039 src.advancePastNonNewline(); 1040 m_doctypeToken.setState(DoctypeAfterPublicID); 1041 if (inViewSourceMode()) 1042 m_doctypeToken.m_source.append(c); 1043 } else if (c == '>') { 1044 // Considered bogus. Don't process the doctype. 1045 src.advancePastNonNewline(); 1046 state.setInDoctype(false); 1047 if (inViewSourceMode()) 1048 processDoctypeToken(); 1049 } else { 1050 m_doctypeToken.m_publicID.append(c); 1051 src.advance(m_lineNumber); 1052 if (inViewSourceMode()) 1053 m_doctypeToken.m_source.append(c); 1054 } 1055 break; 1056 } 1057 case DoctypeAfterPublicID: 1058 if (c == '\"' || c == '\'') { 1059 tquote = c == '\"' ? DoubleQuote : SingleQuote; 1060 m_doctypeToken.setState(DoctypeSystemID); 1061 src.advancePastNonNewline(); 1062 if (inViewSourceMode()) 1063 m_doctypeToken.m_source.append(c); 1064 } else if (c == '>') { 1065 // Valid doctype. Emit it now. 1066 src.advancePastNonNewline(); 1067 state.setInDoctype(false); 1068 processDoctypeToken(); 1069 } else if (isWhitespace) { 1070 src.advance(m_lineNumber); 1071 if (inViewSourceMode()) 1072 m_doctypeToken.m_source.append(c); 1073 } else 1074 m_doctypeToken.setState(DoctypeBogus); 1075 break; 1076 case DoctypeBeforeSystemID: 1077 if (c == '\"' || c == '\'') { 1078 tquote = c == '\"' ? DoubleQuote : SingleQuote; 1079 m_doctypeToken.setState(DoctypeSystemID); 1080 src.advancePastNonNewline(); 1081 if (inViewSourceMode()) 1082 m_doctypeToken.m_source.append(c); 1083 } else if (c == '>') { 1084 // Considered bogus. Don't process the doctype. 
1085 src.advancePastNonNewline(); 1086 state.setInDoctype(false); 1087 } else if (isWhitespace) { 1088 src.advance(m_lineNumber); 1089 if (inViewSourceMode()) 1090 m_doctypeToken.m_source.append(c); 1091 } else 1092 m_doctypeToken.setState(DoctypeBogus); 1093 break; 1094 case DoctypeSystemID: 1095 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) { 1096 src.advancePastNonNewline(); 1097 m_doctypeToken.setState(DoctypeAfterSystemID); 1098 if (inViewSourceMode()) 1099 m_doctypeToken.m_source.append(c); 1100 } else if (c == '>') { 1101 // Considered bogus. Don't process the doctype. 1102 src.advancePastNonNewline(); 1103 state.setInDoctype(false); 1104 if (inViewSourceMode()) 1105 processDoctypeToken(); 1106 } else { 1107 m_doctypeToken.m_systemID.append(c); 1108 src.advance(m_lineNumber); 1109 if (inViewSourceMode()) 1110 m_doctypeToken.m_source.append(c); 1111 } 1112 break; 1113 case DoctypeAfterSystemID: 1114 if (c == '>') { 1115 // Valid doctype. Emit it now. 1116 src.advancePastNonNewline(); 1117 state.setInDoctype(false); 1118 processDoctypeToken(); 1119 } else if (isWhitespace) { 1120 src.advance(m_lineNumber); 1121 if (inViewSourceMode()) 1122 m_doctypeToken.m_source.append(c); 1123 } else 1124 m_doctypeToken.setState(DoctypeBogus); 1125 break; 1126 case DoctypeBogus: 1127 if (c == '>') { 1128 // Done with the bogus doctype. 
1129 src.advancePastNonNewline(); 1130 state.setInDoctype(false); 1131 if (inViewSourceMode()) 1132 processDoctypeToken(); 1133 } else { 1134 src.advance(m_lineNumber); // Just keep scanning for '>' 1135 if (inViewSourceMode()) 1136 m_doctypeToken.m_source.append(c); 1137 } 1138 break; 1139 default: 1140 break; 1141 } 1142 } 1143 return state; 1144 } 1145 1146 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state) 1147 { 1148 ASSERT(!state.hasEntityState()); 1149 1150 unsigned cBufferPos = m_cBufferPos; 1151 1152 bool lastIsSlash = false; 1153 1154 while (!src.isEmpty()) { 1155 checkBuffer(); 1156 switch (state.tagState()) { 1157 case NoTag: 1158 { 1159 m_cBufferPos = cBufferPos; 1160 return state; 1161 } 1162 case TagName: 1163 { 1164 if (searchCount > 0) { 1165 if (*src == commentStart[searchCount]) { 1166 searchCount++; 1167 if (searchCount == 2) 1168 m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well. 1169 else 1170 m_doctypeSearchCount = 0; 1171 if (searchCount == 4) { 1172 // Found '<!--' sequence 1173 src.advancePastNonNewline(); 1174 m_dest = m_buffer; // ignore the previous part of this tag 1175 state.setInComment(true); 1176 state.setTagState(NoTag); 1177 1178 // Fix bug 34302 at kde.bugs.org. Go ahead and treat 1179 // <!--> as a valid comment, since both mozilla and IE on windows 1180 // can handle this case. Only do this in quirks mode. -dwh 1181 if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) { 1182 state.setInComment(false); 1183 src.advancePastNonNewline(); 1184 if (!src.isEmpty()) 1185 m_cBuffer[cBufferPos++] = *src; 1186 } else 1187 state = parseComment(src, state); 1188 1189 m_cBufferPos = cBufferPos; 1190 return state; // Finished parsing tag! 
// (Continuation of HTMLTokenizer::parseTag, inside "case TagName": the brace below closes
// the "found '<!--'" branch that starts before this span.)
                    }
                    // Partial "<!--" match: keep the character in m_cBuffer and go on.
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    break;
                } else
                    searchCount = 0; // Stop looking for '<!--' sequence
            }

            if (m_doctypeSearchCount > 0) {
                // Case-insensitive, character-by-character match against doctypeStart.
                if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
                    m_doctypeSearchCount++;
                    m_cBuffer[cBufferPos++] = *src;
                    src.advancePastNonNewline();
                    if (m_doctypeSearchCount == 9) {
                        // Found '<!DOCTYPE' sequence
                        state.setInDoctype(true);
                        state.setTagState(NoTag);
                        m_doctypeToken.reset();
                        if (inViewSourceMode())
                            m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
                        state = parseDoctype(src, state);
                        m_cBufferPos = cBufferPos;
                        return state;
                    }
                    break;
                } else
                    m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
            }

            // Copy tag-name characters into m_cBuffer until whitespace, '>' or '<' is seen,
            // the input runs out, or m_cBuffer holds CBUFLEN characters.
            bool finish = false;
            unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
                    finish = true;
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;
                src.advancePastNonNewline();
            }

            // Disadvantage: we add the possible rest of the tag
            // as attribute names. ### judge if this causes problems
            if (finish || CBUFLEN == cBufferPos) {
                bool beginTag;
                UChar* ptr = m_cBuffer;
                unsigned int len = cBufferPos;
                // NOTE(review): writes index CBUFLEN when the buffer is full -- assumes
                // m_cBuffer is declared with room past CBUFLEN; confirm in the header.
                m_cBuffer[cBufferPos] = '\0';
                if ((cBufferPos > 0) && (*ptr == '/')) {
                    // End Tag
                    beginTag = false;
                    ptr++;
                    len--;
                }
                else
                    // Start Tag
                    beginTag = true;

                // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".
                if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
                    ptr[--len] = '\0';

                // Now that we've shaved off any invalid / that might have followed the name), make the tag.
                // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
                if (ptr[0] != '!' || inViewSourceMode()) {
                    m_currentToken.tagName = AtomicString(ptr);
                    m_currentToken.beginTag = beginTag;
                }
                m_dest = m_buffer;
                state.setTagState(SearchAttribute);
                cBufferPos = 0;
            }
            break;
        }
        case SearchAttribute:
            // Skip whitespace and stray quotes; the first other character decides whether
            // an attribute name starts ('<' or '>' go to SearchEnd instead).
            while (!src.isEmpty()) {
                UChar curchar = *src;
                // In this mode just ignore any quotes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
                    if (curchar == '<' || curchar == '>')
                        state.setTagState(SearchEnd);
                    else
                        state.setTagState(AttributeName);

                    cBufferPos = 0;
                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case AttributeName:
        {
            // NOTE(review): this clear() runs every time the case is entered, including
            // when a name is resumed after the input ran dry mid-name -- confirm intended.
            m_rawAttributeBeforeValue.clear();
            int ll = min(src.length(), CBUFLEN - cBufferPos);
            while (ll--) {
                UChar curchar = *src;
                // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the
                // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
                // The test below is true for '<', '=', '>', ASCII whitespace and '/'.
                if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
                    m_cBuffer[cBufferPos] = '\0';
                    m_attrName = AtomicString(m_cBuffer);
                    // Slot 0 of m_buffer is set to 0; the value states build the attribute
                    // value from m_buffer + 1 onwards.
                    m_dest = m_buffer;
                    *m_dest++ = 0;
                    state.setTagState(SearchEqual);
                    // 'a' (and 'x' / 'v' below) are marker characters handed to the token in
                    // view-source mode; their meaning is defined by the consumer, not here.
                    if (inViewSourceMode())
                        m_currentToken.addViewSourceChar('a');
                    break;
                }

                // tolower() shows up on profiles. This is faster!
                if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
                    m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
                else
                    m_cBuffer[cBufferPos++] = curchar;

                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            // Name buffer is full: end the name here exactly as a delimiter would.
            if (cBufferPos == CBUFLEN) {
                m_cBuffer[cBufferPos] = '\0';
                m_attrName = AtomicString(m_cBuffer);
                m_dest = m_buffer;
                *m_dest++ = 0;
                state.setTagState(SearchEqual);
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar('a');
            }
            break;
        }
        case SearchEqual:
            // Look for '=' after an attribute name. Any other significant character means
            // the attribute has no value and is added with emptyAtom.
            while (!src.isEmpty()) {
                UChar curchar = *src;

                if (lastIsSlash && curchar == '>') {
                    // This is a quirk (with a long sad history). We have to do this
                    // since widgets do <script src="foo.js"/> and expect the tag to close.
                    if (m_currentToken.tagName == scriptTag)
                        m_currentToken.selfClosingTag = true;
                    m_currentToken.brokenXMLStyle = true;
                }

                // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
                if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
                    if (curchar == '=') {
                        state.setTagState(SearchValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        m_rawAttributeBeforeValue.append(curchar);
                        src.advancePastNonNewline();
                    } else {
                        m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        lastIsSlash = false;
                    }
                    break;
                }

                lastIsSlash = curchar == '/';

                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case SearchValue:
            // Skip whitespace after '='; an opening quote selects QuotedValue (and is
            // consumed), anything else selects Value (and is left in place).
            while (!src.isEmpty()) {
                UChar curchar = *src;
                if (!isASCIISpace(curchar)) {
                    if (curchar == '\'' || curchar == '\"') {
                        tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
                        state.setTagState(QuotedValue);
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        m_rawAttributeBeforeValue.append(curchar);
                        src.advancePastNonNewline();
                    } else
                        state.setTagState(Value);

                    break;
                }
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(curchar);
                m_rawAttributeBeforeValue.append(curchar);
                src.advance(m_lineNumber);
            }
            break;
        case QuotedValue:
            while (!src.isEmpty()) {
                checkBuffer();

                UChar curchar = *src;
                // Every character handled specially here ('&', quotes, '>') is <= '>', so
                // this one comparison screens out ordinary text quickly.
                if (curchar <= '>' && !src.escaped()) {
                    if (curchar == '>' && m_attrName.isEmpty()) {
                        // Handle a case like <img '>. Just go ahead and be willing
                        // to close the whole tag. Don't consume the character and
                        // just go back into SearchEnd while ignoring the whole
                        // value.
                        // FIXME: Note that this is actually not a very good solution.
                        // It doesn't handle the general case of
                        // unmatched quotes among attributes that have names. -dwh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (!attributeValue.contains('/'))
                            m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('x');
                        state.setTagState(SearchAttribute);
                        m_dest = m_buffer;
                        tquote = NoQuote;
                        break;
                    }

                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }

                    if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
                        // some <input type=hidden> rely on trailing spaces. argh
                        while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
                            m_dest--; // remove trailing newlines
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
                        if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
                            m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
                            if (inViewSourceMode())
                                m_currentToken.addViewSourceChar('x');
                        } else if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');

                        // For <script src=...>, let the XSS auditor (when present) veto the
                        // URL; a vetoed value is replaced by blankURL().
                        if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
                            String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
                            if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
                                attributeValue = blankURL().string();
                        }

                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        tquote = NoQuote;
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar(curchar);
                        src.advancePastNonNewline();
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case Value:
            while (!src.isEmpty()) {
                checkBuffer();
                UChar curchar = *src;
                if (curchar <= '>' && !src.escaped()) {
                    // parse Entities
                    if (curchar == '&') {
                        src.advancePastNonNewline();
                        state = parseEntity(src, m_dest, state, cBufferPos, true, true);
                        break;
                    }
                    // no quotes. Every space means end of value
                    // '/' does not delimit in IE!
                    if (isASCIISpace(curchar) || curchar == '>') {
                        AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

                        // Same XSS-auditor check as in the QuotedValue case.
                        if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
                            String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
                            if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
                                attributeValue = blankURL().string();
                        }

                        m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
                        if (inViewSourceMode())
                            m_currentToken.addViewSourceChar('v');
                        m_dest = m_buffer;
                        state.setTagState(SearchAttribute);
                        break;
                    }
                }

                *m_dest++ = curchar;
                src.advance(m_lineNumber);
            }
            break;
        case SearchEnd:
        {
            // Skip ahead to the '>' (or a stray '<') that ends the tag; a '/' on the way
            // marks the token as self-closing.
            while (!src.isEmpty()) {
                UChar ch = *src;
                if (ch == '>' || ch == '<')
                    break;
                if (ch == '/')
                    m_currentToken.selfClosingTag = true;
                if (inViewSourceMode())
                    m_currentToken.addViewSourceChar(ch);
                src.advance(m_lineNumber);
            }
            if (src.isEmpty())
                break;

            searchCount = 0; // Stop looking for '<!--' sequence
            state.setTagState(NoTag);
            tquote = NoQuote;

            // A '<' is left in the stream; a '>' is consumed.
            if (*src != '<')
                src.advance(m_lineNumber);

            if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
                m_cBufferPos = cBufferPos;
                return state;
            }

            // Everything needed after processToken() is copied out of m_currentToken
            // first, because processToken() resets the token.
            AtomicString tagName = m_currentToken.tagName;

            // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
            // compatibility.
            bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
            bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
            if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
                Attribute* a = 0;
                m_scriptTagSrcAttrValue = String();
                m_scriptTagCharsetAttrValue = String();
                if (m_currentToken.attrs && !m_fragment) {
                    if (m_doc->frame() && m_doc->frame()->script()->canExecuteScripts()) {
                        if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
                            m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
                    }
                }
            }

            RefPtr<Node> n = processToken();
            m_cBufferPos = cBufferPos;
            if (n || inViewSourceMode()) {
                // Snapshot taken so the "ate the rest of the document" case below can rewind.
                State savedState = state;
                SegmentedString savedSrc = src;
                long savedLineno = m_lineNumber;
                if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
                    if (beginTag)
                        state.setDiscardLF(true); // Discard the first LF after we open a pre.
                } else if (tagName == scriptTag) {
                    ASSERT(!m_scriptNode);
                    m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
                    if (m_scriptNode)
                        m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
                    // For this and the following elements the content is not HTML: set the
                    // end-tag stopper string and its length, then let parseNonHTMLText()
                    // consume the content.
                    if (beginTag) {
                        m_searchStopper = scriptEnd;
                        m_searchStopperLength = 8;
                        state.setInScript(true);
                        state = parseNonHTMLText(src, state);
                    } else if (isSelfClosingScript) { // Handle <script src="foo"/>
                        state.setInScript(true);
                        state = scriptHandler(state);
                    }
                } else if (tagName == styleTag) {
                    if (beginTag) {
                        m_searchStopper = styleEnd;
                        m_searchStopperLength = 7;
                        state.setInStyle(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == textareaTag) {
                    if (beginTag) {
                        m_searchStopper = textareaEnd;
                        m_searchStopperLength = 10;
                        state.setInTextArea(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == titleTag) {
                    if (beginTag) {
                        m_searchStopper = titleEnd;
                        m_searchStopperLength = 7;
                        state.setInTitle(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == xmpTag) {
                    if (beginTag) {
                        m_searchStopper = xmpEnd;
                        m_searchStopperLength = 5;
                        state.setInXmp(true);
                        state = parseNonHTMLText(src, state);
                    }
                } else if (tagName == iframeTag) {
                    if (beginTag) {
                        m_searchStopper = iframeEnd;
                        m_searchStopperLength = 8;
                        state.setInIFrame(true);
                        state = parseNonHTMLText(src, state);
                    }
                }
                if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
                    // We just ate the rest of the document as the #text node under the special tag!
                    // Reset the state then retokenize without special handling.
                    // Let the parser clean up the missing close tag.
                    // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
                    // at the end of the document unless m_noMoreData is also true. We need
                    // to detect this case elsewhere, and save the state somewhere other
                    // than a local variable.
                    state = savedState;
                    src = savedSrc;
                    m_lineNumber = savedLineno;
                    m_scriptCodeSize = 0;
                }
            }
            if (tagName == plaintextTag)
                state.setInPlainText(beginTag);
            return state; // Finished parsing tag!
        }
        } // end switch
    }
    m_cBufferPos = cBufferPos;
    return state;
}

// Decides whether write() may keep tokenizing or should yield.
//
// |processedCount| counts calls since the last time check and is reset to 0 whenever the
// clock is consulted; |startTime| is the currentTime() value taken when write() started;
// |state| has its allow-yield flag read and then cleared on every call.
//
// Returns false, after arming m_timer, when more than m_tokenizerTimeDelay has elapsed.
// Returns true otherwise. The clock is never consulted while an external script is
// loading, the state forces synchronous parsing, or a script is executing.
inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
{
    // We don't want to be checking elapsed time with every character, so we only check after we've
    // processed a certain number of characters.
    bool allowedYield = state.allowYield();
    state.setAllowYield(false);
    if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
        processedCount = 0;
        if (currentTime() - startTime > m_tokenizerTimeDelay) {
            /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
               load, but this hurts overall performance on slower machines. For now turn this
               off.
            || (!m_doc->haveStylesheetsLoaded() &&
                (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
            // Schedule the timer to keep processing as soon as possible.
// (Continuation of HTMLTokenizer::continueProcessing, inside the "time budget exceeded" branch.)
            m_timer.startOneShot(0);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
            // NOTE(review): the message hard-codes "500ms" although the threshold actually
            // used is m_tokenizerTimeDelay.
            if (currentTime() - startTime > m_tokenizerTimeDelay)
                printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
#endif
            return false;
        }
    }

    processedCount++;
    return true;
}

// Feeds |str| to the tokenizer and tokenizes as much of the pending input as allowed.
//
// Does nothing when m_buffer is null or the parser has been stopped.
//
// The data is only queued, not parsed, when a script is executing and |appendData| is
// true, or when scripts are still pending: it goes to m_currentPrependingSrc if that is
// set, otherwise to m_pendingSrc (and to the preload scanner when one is in progress and
// |appendData| is true). Otherwise the data is appended to m_src and, unless m_timer is
// active, the main loop runs until m_src is empty, a location change is pending, or
// continueProcessing() asks to yield.
//
// The tokenizer state is kept in a local copy during the loop and written back to m_state
// afterwards. end() is called at the bottom if all data has been seen and nothing is
// outstanding; per the comment on that call, that can delete this object.
void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
{
    if (!m_buffer)
        return;

    if (m_parserStopped)
        return;

    SegmentedString source(str);
    if (m_executingScript)
        source.setExcludeLineNumbers();

    if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
        // don't parse; we will do this later
        if (m_currentPrependingSrc)
            m_currentPrependingSrc->append(source);
        else {
            m_pendingSrc.append(source);
#if PRELOAD_SCANNER_ENABLED
            if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
                m_preloadScanner->write(source);
#endif
        }
        return;
    }

#if PRELOAD_SCANNER_ENABLED
    if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
        m_preloadScanner->end();
#endif

    if (!m_src.isEmpty())
        m_src.append(source);
    else
        setSrc(source);

    // Once a timer is set, it has control of when the tokenizer continues.
    if (m_timer.isActive())
        return;

    // Remember the previous value so that nested write() calls restore it correctly.
    bool wasInWrite = m_inWrite;
    m_inWrite = true;

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Beginning write at time %d\n", m_doc->elapsedTime());
#endif

    int processedCount = 0;
    double startTime = currentTime();
#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
#endif

#if ENABLE(INSPECTOR)
    if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
        timelineAgent->willWriteHTML(source.length(), m_lineNumber);
#endif

    Frame* frame = m_doc->frame();

    State state = m_state;

    while (!m_src.isEmpty() && (!frame || !frame->redirectScheduler()->locationChangePending())) {
        if (!continueProcessing(processedCount, startTime, state))
            break;

        // do we need to enlarge the buffer?
        checkBuffer();

        UChar cc = *m_src;

        // skipLF is set after a '\r' (see below) so that a following '\n' is dropped.
        bool wasSkipLF = state.skipLF();
        if (wasSkipLF)
            state.setSkipLF(false);

        if (wasSkipLF && (cc == '\n'))
            m_src.advance();
        else if (state.needsSpecialWriteHandling()) {
            // it's important to keep needsSpecialWriteHandling with the flags this block tests
            if (state.hasEntityState())
                state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
            else if (state.inPlainText())
                state = parseText(m_src, state);
            else if (state.inAnyNonHTMLText())
                state = parseNonHTMLText(m_src, state);
            else if (state.inComment())
                state = parseComment(m_src, state);
            else if (state.inDoctype())
                state = parseDoctype(m_src, state);
            else if (state.inServer())
                state = parseServer(m_src, state);
            else if (state.inProcessingInstruction())
                state = parseProcessingInstruction(m_src, state);
            else if (state.hasTagState())
                state = parseTag(m_src, state);
            else if (state.startTag()) {
                // The previous character was '<'; |cc| decides what kind of construct starts.
                state.setStartTag(false);

                switch (cc) {
                case '/':
                    break;
                case '!': {
                    // <!-- comment --> or <!DOCTYPE ...>
                    searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
                    m_doctypeSearchCount = 1;
                    break;
                }
                case '?': {
                    // xml processing instruction
                    state.setInProcessingInstruction(true);
                    tquote = NoQuote;
                    state = parseProcessingInstruction(m_src, state);
                    continue;

                    break;
                }
                case '%':
                    if (!m_brokenServer) {
                        // <% server stuff, handle as comment %>
                        state.setInServer(true);
                        tquote = NoQuote;
                        state = parseServer(m_src, state);
                        continue;
                    }
                    // else fall through
                default: {
                    if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
                        // Start of a Start-Tag
                    } else {
                        // Invalid tag
                        // Add as is
                        *m_dest = '<';
                        m_dest++;
                        continue;
                    }
                }
                }; // end case

                // Hand any text buffered before the '<' to processToken() before starting on the tag.
                processToken();

                m_cBufferPos = 0;
                state.setTagState(TagName);
                state = parseTag(m_src, state);
            }
        } else if (cc == '&' && !m_src.escaped()) {
            m_src.advancePastNonNewline();
            state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
        } else if (cc == '<' && !m_src.escaped()) {
            m_currentTagStartLineNumber = m_lineNumber;
            m_src.advancePastNonNewline();
            state.setStartTag(true);
            state.setDiscardLF(false);
        } else if (cc == '\n' || cc == '\r') {
            if (state.discardLF())
                // Ignore this LF
                state.setDiscardLF(false); // We have discarded 1 LF
            else {
                // Process this LF
                *m_dest++ = '\n';
                // NOTE(review): '\r' is counted by hand here -- presumably because
                // advance(m_lineNumber) only counts '\n'; confirm in SegmentedString.
                if (cc == '\r' && !m_src.excludeLineNumbers())
                    m_lineNumber++;
            }

            /* Check for MS-DOS CRLF sequence */
            if (cc == '\r')
                state.setSkipLF(true);
            m_src.advance(m_lineNumber);
        } else {
            state.setDiscardLF(false);
            *m_dest++ = cc;
            m_src.advancePastNonNewline();
        }
    }

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Ending write at time %d\n", m_doc->elapsedTime());
#endif

#if ENABLE(INSPECTOR)
    if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
        timelineAgent->didWriteHTML(m_lineNumber);
#endif

    m_inWrite = wasInWrite;

    m_state = state;

#ifdef ANDROID_INSTRUMENT
    android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
#endif

    if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
        end(); // this actually causes us to be deleted

    // After parsing, go ahead and dispatch image beforeload events.
    ImageLoader::dispatchPendingBeforeLoadEvents();
}

// Stops parsing: defers to Tokenizer::stopParsing(), cancels the continuation timer and,
// for a non-fragment tokenizer whose document has a frame, tells the frame loader that
// the tokenizer has processed its data.
void HTMLTokenizer::stopParsing()
{
    Tokenizer::stopParsing();
    m_timer.stop();

    // The part needs to know that the tokenizer has finished with its data,
    // regardless of whether it happened naturally or due to manual intervention.
    if (!m_fragment && m_doc->frame())
        m_doc->frame()->loader()->tokenizerProcessedData();
}

// True while a continuation is scheduled on m_timer or a write() call is in progress.
bool HTMLTokenizer::processingData() const
{
    return m_timer.isActive() || m_inWrite;
}

// Timer callback (presumably bound to m_timer -- the binding is not visible here) that
// resumes tokenizing after continueProcessing() yielded.
void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
{
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
#endif

    if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
        // Restart the timer and let layout win. This is basically a way of ensuring that the layout
        // timer has higher priority than our timer.
        m_timer.startOneShot(0);
        return;
    }

    // Invoke write() as though more data came in. This might cause us to get deleted.
write(SegmentedString(), true);
}

// Completes tokenizing. If m_buffer is still allocated, flushes any pending token
// (unless tag state is active) and frees both m_scriptCode and m_buffer. Afterwards
// notifies m_parser->finished(), or m_doc->finishedParsing() in view-source mode.
void HTMLTokenizer::end()
{
    ASSERT(!m_timer.isActive());
    m_timer.stop(); // Only helps if assertion above fires, but do it anyway.

    if (m_buffer) {
        // parseTag is using the buffer for different matters
        if (!m_state.hasTagState())
            processToken();

        // Release the script buffer and zero all of its bookkeeping.
        fastFree(m_scriptCode);
        m_scriptCode = 0;
        m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

        fastFree(m_buffer);
        m_buffer = 0;
    }

    if (!inViewSourceMode())
        m_parser->finished();
    else
        m_doc->finishedParsing();
}

// Records that no more data will arrive (m_noMoreData). While the state is still
// "in comment" or "in server" and m_scriptCode holds buffered text, the construct is
// flagged as broken (m_brokenComments / m_brokenServer), the buffered text is pushed
// back through write(), and the condition is re-checked. Finally calls end(), unless a
// write is in progress, an external script is loading, a script is executing, or
// m_timer is active.
void HTMLTokenizer::finish()
{
    // do this as long as we don't find matching comment ends
    while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
        // we've found an unmatched comment start
        if (m_state.inComment())
            m_brokenComments = true;
        else
            m_brokenServer = true;
        // NOTE(review): checkScriptBuffer() presumably guarantees room for the two
        // zero terminators written below - confirm against its definition.
        checkScriptBuffer();
        m_scriptCode[m_scriptCodeSize] = 0;
        m_scriptCode[m_scriptCodeSize + 1] = 0;
        int pos;
        String food; // The text that gets re-fed to write() below.
        if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
            food = String(m_scriptCode, m_scriptCodeSize);
        else if (m_state.inServer()) {
            // Re-feed the buffered text with a "<" prepended.
            food = "<";
            food.append(m_scriptCode, m_scriptCodeSize);
        } else {
            // Re-feed only what follows the '>' located by find().
            // NOTE(review): presumably find() yields -1 when there is no '>', which
            // would make this re-feed the whole buffer - confirm.
            pos = find(m_scriptCode, m_scriptCodeSize, '>');
            food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
        }
        // The buffered text now lives in 'food'; drop the script buffer and leave the
        // comment/server states before re-feeding.
        fastFree(m_scriptCode);
        m_scriptCode = 0;
        m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
        m_state.setInComment(false);
        m_state.setInServer(false);
        if (!food.isEmpty())
            write(food, true);
    }
    // this indicates we will not receive any more data... but if we are waiting on
    // an external script to load, we can't finish parsing until that is done
    m_noMoreData = true;
    if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
        end(); // this actually causes us to be deleted
}

// Emits the current token. Text accumulated between m_buffer and m_dest becomes the
// token's text (tagged as textAtom unless the token is a comment). The token is then
// handed to the view-source document or to m_parser, and m_currentToken is reset.
// Returns the node produced by m_parser->parseToken(); returns 0 when there is nothing
// to emit, when the parser is stopped, or in view-source mode.
PassRefPtr<Node> HTMLTokenizer::processToken()
{
    // Only available when not parsing a fragment and the document has a frame.
    ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
    if (scriptController && scriptController->canExecuteScripts())
        // FIXME: Why isn't this m_currentScriptTagStartLineNumber?  I suspect this is wrong.
        scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
    if (m_dest > m_buffer) {
        m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
        if (m_currentToken.tagName != commentAtom)
            m_currentToken.tagName = textAtom;
    } else if (m_currentToken.tagName == nullAtom) {
        // No buffered text and no tag name: nothing to hand over.
        m_currentToken.reset();
        if (scriptController)
            scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
        return 0;
    }

    // Rewind the write position to the start of the buffer.
    m_dest = m_buffer;

    RefPtr<Node> n;

    if (!m_parserStopped) {
        if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
            map->shrinkToLength();
        if (inViewSourceMode())
            static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
        else
            // pass the token over to the parser, the parser DOES NOT delete the token
            n = m_parser->parseToken(&m_currentToken);
    }
    m_currentToken.reset();
    if (scriptController)
        scriptController->setEventHandlerLineNumber(0);

    return n.release();
}

// Hands m_doctypeToken to the view-source document when in view-source mode,
// otherwise to m_parser.
void HTMLTokenizer::processDoctypeToken()
{
    if (inViewSourceMode())
        static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
    else
        m_parser->parseDoctypeToken(&m_doctypeToken);
}

// Must not run while a write is in progress (asserted); cleanup is delegated to reset().
HTMLTokenizer::~HTMLTokenizer()
{
    ASSERT(!m_inWrite);
    reset();
}


// Grows m_buffer by max(len, m_bufferSize) UChars and updates m_bufferSize. m_dest is
// re-pointed at the same offset inside the reallocated buffer. Calls CRASH() if the
// new size would exceed maxSize.
void HTMLTokenizer::enlargeBuffer(int len)
{
    // Resize policy: Always at least double the size of the buffer each time.
    int delta = max(len, m_bufferSize);

    // Check for overflow.
    // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
    static const int maxSize = INT_MAX / sizeof(UChar);
    if (delta > maxSize - m_bufferSize)
        CRASH();

    int newSize = m_bufferSize + delta;
    // fastRealloc may move the buffer, so remember m_dest as an offset.
    int oldOffset = m_dest - m_buffer;
    m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
    m_dest = m_buffer + oldOffset;
    m_bufferSize = newSize;
}

// Grows m_scriptCode by max(len, m_scriptCodeCapacity) UChars and updates
// m_scriptCodeCapacity. Calls CRASH() if the new capacity would exceed maxSize.
// Returns without allocating when the computed new size is 0.
void HTMLTokenizer::enlargeScriptBuffer(int len)
{
    // Resize policy: Always at least double the size of the buffer each time.
    int delta = max(len, m_scriptCodeCapacity);

    // Check for overflow.
    // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
    static const int maxSize = INT_MAX / sizeof(UChar);
    if (delta > maxSize - m_scriptCodeCapacity)
        CRASH();

    int newSize = m_scriptCodeCapacity + delta;
    // If we allow fastRealloc(ptr, 0), it will call CRASH().  We run into this
    // case if the HTML being parsed begins with "<!--" and there's more data
    // coming.
    if (!newSize) {
        ASSERT(!m_scriptCode);
        return;
    }

    m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
    m_scriptCodeCapacity = newSize;
}

// Expects the document's stylesheets to be loaded (asserted). If scripts were held
// back waiting for stylesheets, resumes them through notifyFinished(0).
void HTMLTokenizer::executeScriptsWaitingForStylesheets()
{
    ASSERT(m_doc->haveStylesheetsLoaded());

    if (m_hasScriptsWaitingForStylesheets)
        notifyFinished(0);
}

// The resource argument is ignored; simply tries to run the pending external scripts.
void HTMLTokenizer::notifyFinished(CachedResource*)
{
    executeExternalScriptsIfReady();
}

// Runs pending external scripts from the front of m_pendingScripts for as long as the
// front script reports isLoaded(). While m_doc->haveStylesheetsLoaded() is false it
// only records m_hasScriptsWaitingForStylesheets and returns. The loop also stops when
// continueExecutingExternalScripts() returns false. After each script, unless
// m_requestingScript is set, the contents of m_pendingSrc are fed back through write().
void HTMLTokenizer::executeExternalScriptsIfReady()
{
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
    if (!m_doc->ownerElement())
        printf("script loaded at %d\n", m_doc->elapsedTime());
#endif

    ASSERT(!m_pendingScripts.isEmpty());

    // Make external scripts wait for external stylesheets.
    // FIXME: This needs to be done for inline scripts too.
    m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
    if (m_hasScriptsWaitingForStylesheets)
        return;

    bool finished = false;

    // Reference point for the time budget checked by continueExecutingExternalScripts().
    double startTime = currentTime();
    while (!finished && m_pendingScripts.first()->isLoaded()) {
        if (!continueExecutingExternalScripts(startTime))
            break;

        // Dequeue the script at the front of the pending list.
        CachedScript* cs = m_pendingScripts.first().get();
        m_pendingScripts.removeFirst();
        ASSERT(cache()->disabled() || cs->accessCount() > 0);

        // Replace m_src with an empty source.
        setSrc(SegmentedString());

        // make sure we forget about the script before we execute the new one
        // infinite recursion might happen otherwise
        ScriptSourceCode sourceCode(cs);
        bool errorOccurred = cs->errorOccurred();
        cs->removeClient(this);

        // Take over m_scriptNode; the error/load events below are dispatched on it.
        RefPtr<Node> n = m_scriptNode.release();

#ifdef INSTRUMENT_LAYOUT_SCHEDULING
        if (!m_doc->ownerElement())
            printf("external script beginning execution at %d\n", m_doc->elapsedTime());
#endif

        if (errorOccurred)
            n->dispatchEvent(Event::create(eventNames().errorEvent, true, false));
        else {
            if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
                m_state = scriptExecution(sourceCode, m_state);
#if ENABLE(XHTMLMP)
            else
                m_doc->setShouldProcessNoscriptElement(true);
#endif
            n->dispatchEvent(Event::create(eventNames().loadEvent, false, false));
        }

        // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
        // call above, so test afterwards.
        finished = m_pendingScripts.isEmpty();
        if (finished) {
            ASSERT(!m_hasScriptsWaitingForStylesheets);
            m_state.setLoadingExtScript(false);
#ifdef INSTRUMENT_LAYOUT_SCHEDULING
            if (!m_doc->ownerElement())
                printf("external script finished execution at %d\n", m_doc->elapsedTime());
#endif
        } else if (m_hasScriptsWaitingForStylesheets) {
            // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
            // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
            finished = true;
        }

        // 'm_requestingScript' is true when we are called synchronously from
        // scriptHandler(). In that case scriptHandler() will take care
        // of m_pendingSrc.
        if (!m_requestingScript) {
            // Copy the pending source out and clear the member before re-entering write().
            SegmentedString rest = m_pendingSrc;
            m_pendingSrc.clear();
            write(rest, false);
            // we might be deleted at this point, do not access any members.
        }
    }
}

// Timer callback for m_externalScriptsTimer. If the view has a layout pending and the
// document reports no minimum layout delay, re-arms the timer (zero delay) and returns
// so that layout can happen first; otherwise resumes running external scripts.
void HTMLTokenizer::executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*)
{
    if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
        // Restart the timer and do layout first.
        m_externalScriptsTimer.startOneShot(0);
        return;
    }

    // Continue executing external scripts.
    executeExternalScriptsIfReady();
}

// Decides whether the caller may run another external script right now.
// Returns false when m_externalScriptsTimer is already active, or when more than
// m_tokenizerTimeDelay has elapsed since startTime (in which case the timer is
// scheduled with zero delay). Returns true otherwise.
bool HTMLTokenizer::continueExecutingExternalScripts(double startTime)
{
    if (m_externalScriptsTimer.isActive())
        return false;

    if (currentTime() - startTime > m_tokenizerTimeDelay) {
        // Schedule the timer to keep processing as soon as possible.
        m_externalScriptsTimer.startOneShot(0);
        return false;
    }
    return true;
}

// Reports the "loading external script" state flag.
bool HTMLTokenizer::isWaitingForScripts() const
{
    return m_state.loadingExtScript();
}

// Replaces m_src with a copy of 'source'.
void HTMLTokenizer::setSrc(const SegmentedString& source)
{
    m_src = source;
}

// Parses 'source' into 'fragment' using a temporary tokenizer forced into synchronous
// mode: the whole string is written in one call and then finish() is invoked.
void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission)
{
    HTMLTokenizer tok(fragment, scriptingPermission);
    tok.setForceSynchronous(true);
    tok.write(source, true);
    tok.finish();
    ASSERT(!tok.processingData()); // make sure we're done (see 3963151)
}

// Looks up 'name' (a NUL-terminated string; its length is taken with strlen) via
// findEntity() and returns the matching entity's code, or 0 when none is found.
UChar decodeNamedEntity(const char* name)
{
    const Entity* e = findEntity(name, strlen(name));
    return e ? e->code : 0;
}

} // namespace WebCore