1 /* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "HTMLDocumentParser.h" 28 29 #include "ContentSecurityPolicy.h" 30 #include "DocumentFragment.h" 31 #include "Element.h" 32 #include "Frame.h" 33 #include "HTMLNames.h" 34 #include "HTMLParserScheduler.h" 35 #include "HTMLTokenizer.h" 36 #include "HTMLPreloadScanner.h" 37 #include "HTMLScriptRunner.h" 38 #include "HTMLTreeBuilder.h" 39 #include "HTMLDocument.h" 40 #include "InspectorInstrumentation.h" 41 #include "NestingLevelIncrementer.h" 42 #include "Settings.h" 43 44 #ifdef ANDROID_INSTRUMENT 45 #include "TimeCounter.h" 46 #endif 47 48 namespace WebCore { 49 50 using namespace HTMLNames; 51 52 namespace { 53 54 // This is a direct transcription of step 4 from: 55 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case 56 HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors) 57 { 58 if (!contextElement) 59 return HTMLTokenizer::DataState; 60 61 const QualifiedName& contextTag = contextElement->tagQName(); 62 63 if (contextTag.matches(titleTag) || contextTag.matches(textareaTag)) 64 return HTMLTokenizer::RCDATAState; 65 if (contextTag.matches(styleTag) 66 || contextTag.matches(xmpTag) 67 || contextTag.matches(iframeTag) 68 || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame())) 69 || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame())) 70 || contextTag.matches(noframesTag)) 71 return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState; 72 if (contextTag.matches(scriptTag)) 73 return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState; 74 if (contextTag.matches(plaintextTag)) 75 return HTMLTokenizer::PLAINTEXTState; 76 return HTMLTokenizer::DataState; 77 } 78 79 } // namespace 80 81 HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors) 82 : ScriptableDocumentParser(document) 83 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document))) 84 , m_scriptRunner(HTMLScriptRunner::create(document, this)) 85 , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document))) 86 , m_parserScheduler(HTMLParserScheduler::create(this)) 87 , m_xssFilter(this) 88 , m_endWasDelayed(false) 89 , m_pumpSessionNestingLevel(0) 90 { 91 } 92 93 // FIXME: Member variables should be grouped into self-initializing structs to 94 // minimize code duplication between these constructors. 95 HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 96 : ScriptableDocumentParser(fragment->document()) 97 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document()))) 98 , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document()))) 99 , m_xssFilter(this) 100 , m_endWasDelayed(false) 101 , m_pumpSessionNestingLevel(0) 102 { 103 bool reportErrors = false; // For now document fragment parsing never reports errors. 104 m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors)); 105 } 106 107 HTMLDocumentParser::~HTMLDocumentParser() 108 { 109 ASSERT(!m_parserScheduler); 110 ASSERT(!m_pumpSessionNestingLevel); 111 ASSERT(!m_preloadScanner); 112 } 113 114 void HTMLDocumentParser::detach() 115 { 116 DocumentParser::detach(); 117 if (m_scriptRunner) 118 m_scriptRunner->detach(); 119 m_treeBuilder->detach(); 120 // FIXME: It seems wrong that we would have a preload scanner here. 121 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do. 122 m_preloadScanner.clear(); 123 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 124 } 125 126 void HTMLDocumentParser::stopParsing() 127 { 128 DocumentParser::stopParsing(); 129 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 130 } 131 132 // This kicks off "Once the user agent stops parsing" as described by: 133 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end 134 void HTMLDocumentParser::prepareToStopParsing() 135 { 136 ASSERT(!hasInsertionPoint()); 137 138 // pumpTokenizer can cause this parser to be detached from the Document, 139 // but we need to ensure it isn't deleted yet. 140 RefPtr<HTMLDocumentParser> protect(this); 141 142 // NOTE: This pump should only ever emit buffered character tokens, 143 // so ForceSynchronous vs. AllowYield should be meaningless. 144 pumpTokenizerIfPossible(ForceSynchronous); 145 146 if (isStopped()) 147 return; 148 149 DocumentParser::prepareToStopParsing(); 150 151 // We will not have a scriptRunner when parsing a DocumentFragment. 152 if (m_scriptRunner) 153 document()->setReadyState(Document::Interactive); 154 155 attemptToRunDeferredScriptsAndEnd(); 156 } 157 158 bool HTMLDocumentParser::isParsingFragment() const 159 { 160 return m_treeBuilder->isParsingFragment(); 161 } 162 163 bool HTMLDocumentParser::processingData() const 164 { 165 return isScheduledForResume() || inPumpSession(); 166 } 167 168 void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode) 169 { 170 if (isStopped() || m_treeBuilder->isPaused()) 171 return; 172 173 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump. 174 if (isScheduledForResume()) { 175 ASSERT(mode == AllowYield); 176 return; 177 } 178 179 pumpTokenizer(mode); 180 } 181 182 bool HTMLDocumentParser::isScheduledForResume() const 183 { 184 return m_parserScheduler && m_parserScheduler->isScheduledForResume(); 185 } 186 187 // Used by HTMLParserScheduler 188 void HTMLDocumentParser::resumeParsingAfterYield() 189 { 190 // pumpTokenizer can cause this parser to be detached from the Document, 191 // but we need to ensure it isn't deleted yet. 192 RefPtr<HTMLDocumentParser> protect(this); 193 194 // We should never be here unless we can pump immediately. Call pumpTokenizer() 195 // directly so that ASSERTS will fire if we're wrong. 196 pumpTokenizer(AllowYield); 197 endIfDelayed(); 198 } 199 200 bool HTMLDocumentParser::runScriptsForPausedTreeBuilder() 201 { 202 ASSERT(m_treeBuilder->isPaused()); 203 204 TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition(); 205 RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition); 206 // We will not have a scriptRunner when parsing a DocumentFragment. 207 if (!m_scriptRunner) 208 return true; 209 return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition); 210 } 211 212 bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session) 213 { 214 if (isStopped()) 215 return false; 216 217 // The parser will pause itself when waiting on a script to load or run. 218 if (m_treeBuilder->isPaused()) { 219 if (mode == AllowYield) 220 m_parserScheduler->checkForYieldBeforeScript(session); 221 222 // If we don't run the script, we cannot allow the next token to be taken. 223 if (session.needsYield) 224 return false; 225 226 // If we're paused waiting for a script, we try to execute scripts before continuing. 227 bool shouldContinueParsing = runScriptsForPausedTreeBuilder(); 228 m_treeBuilder->setPaused(!shouldContinueParsing); 229 if (!shouldContinueParsing || isStopped()) 230 return false; 231 } 232 233 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the 234 // Frame, but this approach is how the old parser handled 235 // stopping when the page assigns window.location. What really 236 // should happen is that assigning window.location causes the 237 // parser to stop parsing cleanly. The problem is we're not 238 // perpared to do that at every point where we run JavaScript. 239 if (!isParsingFragment() 240 && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending()) 241 return false; 242 243 if (mode == AllowYield) 244 m_parserScheduler->checkForYieldBeforeToken(session); 245 246 return true; 247 } 248 249 void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode) 250 { 251 ASSERT(!isStopped()); 252 ASSERT(!isScheduledForResume()); 253 // ASSERT that this object is both attached to the Document and protected. 254 ASSERT(refCount() >= 2); 255 256 PumpSession session(m_pumpSessionNestingLevel); 257 258 // We tell the InspectorInstrumentation about every pump, even if we 259 // end up pumping nothing. It can filter out empty pumps itself. 260 // FIXME: m_input.current().length() is only accurate if we 261 // end up parsing the whole buffer in this pump. We should pass how 262 // much we parsed as part of didWriteHTML instead of willWriteHTML. 263 InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber()); 264 265 while (canTakeNextToken(mode, session) && !session.needsYield) { 266 if (!isParsingFragment()) 267 m_sourceTracker.start(m_input, m_token); 268 269 if (!m_tokenizer->nextToken(m_input.current(), m_token)) 270 break; 271 272 if (!isParsingFragment()) { 273 m_sourceTracker.end(m_input, m_token); 274 275 // We do not XSS filter innerHTML, which means we (intentionally) fail 276 // http/tests/security/xssAuditor/dom-write-innerHTML.html 277 m_xssFilter.filterToken(m_token); 278 } 279 280 m_treeBuilder->constructTreeFromToken(m_token); 281 ASSERT(m_token.isUninitialized()); 282 } 283 284 // Ensure we haven't been totally deref'ed after pumping. Any caller of this 285 // function should be holding a RefPtr to this to ensure we weren't deleted. 286 ASSERT(refCount() >= 1); 287 288 if (isStopped()) 289 return; 290 291 if (session.needsYield) 292 m_parserScheduler->scheduleForResume(); 293 294 if (isWaitingForScripts()) { 295 ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState); 296 if (!m_preloadScanner) { 297 m_preloadScanner.set(new HTMLPreloadScanner(document())); 298 m_preloadScanner->appendToEnd(m_input.current()); 299 } 300 m_preloadScanner->scan(); 301 } 302 303 InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber()); 304 } 305 306 bool HTMLDocumentParser::hasInsertionPoint() 307 { 308 // FIXME: The wasCreatedByScript() branch here might not be fully correct. 309 // Our model of the EOF character differs slightly from the one in 310 // the spec because our treatment is uniform between network-sourced 311 // and script-sourced input streams whereas the spec treats them 312 // differently. 313 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile()); 314 } 315 316 void HTMLDocumentParser::insert(const SegmentedString& source) 317 { 318 if (isStopped()) 319 return; 320 321 #ifdef ANDROID_INSTRUMENT 322 android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter); 323 #endif 324 325 // pumpTokenizer can cause this parser to be detached from the Document, 326 // but we need to ensure it isn't deleted yet. 327 RefPtr<HTMLDocumentParser> protect(this); 328 329 SegmentedString excludedLineNumberSource(source); 330 excludedLineNumberSource.setExcludeLineNumbers(); 331 m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource); 332 pumpTokenizerIfPossible(ForceSynchronous); 333 334 if (isWaitingForScripts()) { 335 // Check the document.write() output with a separate preload scanner as 336 // the main scanner can't deal with insertions. 337 HTMLPreloadScanner preloadScanner(document()); 338 preloadScanner.appendToEnd(source); 339 preloadScanner.scan(); 340 } 341 342 endIfDelayed(); 343 } 344 345 void HTMLDocumentParser::append(const SegmentedString& source) 346 { 347 if (isStopped()) 348 return; 349 350 // pumpTokenizer can cause this parser to be detached from the Document, 351 // but we need to ensure it isn't deleted yet. 352 RefPtr<HTMLDocumentParser> protect(this); 353 354 if (m_preloadScanner) { 355 if (m_input.current().isEmpty() && !isWaitingForScripts()) { 356 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner. 357 // Clear the scanner so we know to scan starting from the current input point if we block again. 358 m_preloadScanner.clear(); 359 } else { 360 m_preloadScanner->appendToEnd(source); 361 if (isWaitingForScripts()) 362 m_preloadScanner->scan(); 363 } 364 } 365 366 m_input.appendToEnd(source); 367 368 if (inPumpSession()) { 369 // We've gotten data off the network in a nested write. 370 // We don't want to consume any more of the input stream now. Do 371 // not worry. We'll consume this data in a less-nested write(). 372 #ifdef ANDROID_INSTRUMENT 373 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); 374 #endif 375 return; 376 } 377 378 pumpTokenizerIfPossible(AllowYield); 379 380 endIfDelayed(); 381 #ifdef ANDROID_INSTRUMENT 382 android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__); 383 #endif 384 } 385 386 void HTMLDocumentParser::end() 387 { 388 ASSERT(!isDetached()); 389 ASSERT(!isScheduledForResume()); 390 391 // Informs the the rest of WebCore that parsing is really finished (and deletes this). 392 m_treeBuilder->finished(); 393 } 394 395 void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd() 396 { 397 ASSERT(isStopping()); 398 ASSERT(!hasInsertionPoint()); 399 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing()) 400 return; 401 end(); 402 } 403 404 void HTMLDocumentParser::attemptToEnd() 405 { 406 // finish() indicates we will not receive any more data. If we are waiting on 407 // an external script to load, we can't finish parsing quite yet. 408 409 if (shouldDelayEnd()) { 410 m_endWasDelayed = true; 411 return; 412 } 413 prepareToStopParsing(); 414 } 415 416 void HTMLDocumentParser::endIfDelayed() 417 { 418 // If we've already been detached, don't bother ending. 419 if (isDetached()) 420 return; 421 422 if (!m_endWasDelayed || shouldDelayEnd()) 423 return; 424 425 m_endWasDelayed = false; 426 prepareToStopParsing(); 427 } 428 429 void HTMLDocumentParser::finish() 430 { 431 // FIXME: We should ASSERT(!m_parserStopped) here, since it does not 432 // makes sense to call any methods on DocumentParser once it's been stopped. 433 // However, FrameLoader::stop calls Document::finishParsing unconditionally 434 // which in turn calls m_parser->finish(). 435 436 // We're not going to get any more data off the network, so we tell the 437 // input stream we've reached the end of file. finish() can be called more 438 // than once, if the first time does not call end(). 439 if (!m_input.haveSeenEndOfFile()) 440 m_input.markEndOfFile(); 441 attemptToEnd(); 442 } 443 444 bool HTMLDocumentParser::finishWasCalled() 445 { 446 return m_input.haveSeenEndOfFile(); 447 } 448 449 // This function is virtual and just for the DocumentParser interface. 450 bool HTMLDocumentParser::isExecutingScript() const 451 { 452 return inScriptExecution(); 453 } 454 455 // This function is non-virtual and used throughout the implementation. 456 bool HTMLDocumentParser::inScriptExecution() const 457 { 458 if (!m_scriptRunner) 459 return false; 460 return m_scriptRunner->isExecutingScript(); 461 } 462 463 String HTMLDocumentParser::sourceForToken(const HTMLToken& token) 464 { 465 return m_sourceTracker.sourceForToken(token); 466 } 467 468 int HTMLDocumentParser::lineNumber() const 469 { 470 return m_tokenizer->lineNumber(); 471 } 472 473 TextPosition0 HTMLDocumentParser::textPosition() const 474 { 475 const SegmentedString& currentString = m_input.current(); 476 WTF::ZeroBasedNumber line = currentString.currentLine(); 477 WTF::ZeroBasedNumber column = currentString.currentColumn(); 478 ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt()); 479 480 return TextPosition0(line, column); 481 } 482 483 bool HTMLDocumentParser::isWaitingForScripts() const 484 { 485 return m_treeBuilder->isPaused(); 486 } 487 488 void HTMLDocumentParser::resumeParsingAfterScriptExecution() 489 { 490 ASSERT(!inScriptExecution()); 491 ASSERT(!m_treeBuilder->isPaused()); 492 493 pumpTokenizerIfPossible(AllowYield); 494 endIfDelayed(); 495 } 496 497 void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript) 498 { 499 ASSERT(!cachedScript->isLoaded()); 500 // addClient would call notifyFinished if the load were complete. 501 // Callers do not expect to be re-entered from this call, so they should 502 // not an already-loaded CachedResource. 503 cachedScript->addClient(this); 504 } 505 506 void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript) 507 { 508 cachedScript->removeClient(this); 509 } 510 511 void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan() 512 { 513 ASSERT(m_preloadScanner); 514 m_preloadScanner->appendToEnd(m_input.current()); 515 m_preloadScanner->scan(); 516 } 517 518 void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource) 519 { 520 // pumpTokenizer can cause this parser to be detached from the Document, 521 // but we need to ensure it isn't deleted yet. 522 RefPtr<HTMLDocumentParser> protect(this); 523 524 ASSERT(m_scriptRunner); 525 ASSERT(!inScriptExecution()); 526 if (isStopping()) { 527 attemptToRunDeferredScriptsAndEnd(); 528 return; 529 } 530 531 ASSERT(m_treeBuilder->isPaused()); 532 // Note: We only ever wait on one script at a time, so we always know this 533 // is the one we were waiting on and can un-pause the tree builder. 534 m_treeBuilder->setPaused(false); 535 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource); 536 m_treeBuilder->setPaused(!shouldContinueParsing); 537 if (shouldContinueParsing) 538 resumeParsingAfterScriptExecution(); 539 } 540 541 void HTMLDocumentParser::executeScriptsWaitingForStylesheets() 542 { 543 // Document only calls this when the Document owns the DocumentParser 544 // so this will not be called in the DocumentFragment case. 545 ASSERT(m_scriptRunner); 546 // Ignore calls unless we have a script blocking the parser waiting on a 547 // stylesheet load. Otherwise we are currently parsing and this 548 // is a re-entrant call from encountering a </ style> tag. 549 if (!m_scriptRunner->hasScriptsWaitingForStylesheets()) 550 return; 551 552 // pumpTokenizer can cause this parser to be detached from the Document, 553 // but we need to ensure it isn't deleted yet. 554 RefPtr<HTMLDocumentParser> protect(this); 555 556 ASSERT(!m_scriptRunner->isExecutingScript()); 557 ASSERT(m_treeBuilder->isPaused()); 558 // Note: We only ever wait on one script at a time, so we always know this 559 // is the one we were waiting on and can un-pause the tree builder. 560 m_treeBuilder->setPaused(false); 561 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets(); 562 m_treeBuilder->setPaused(!shouldContinueParsing); 563 if (shouldContinueParsing) 564 resumeParsingAfterScriptExecution(); 565 } 566 567 ScriptController* HTMLDocumentParser::script() const 568 { 569 return document()->frame() ? document()->frame()->script() : 0; 570 } 571 572 void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 573 { 574 RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission); 575 parser->insert(source); // Use insert() so that the parser will not yield. 576 parser->finish(); 577 ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151> 578 parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction. 579 } 580 581 bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document) 582 { 583 ASSERT(document); 584 return document->settings() && document->settings()->usePreHTML5ParserQuirks(); 585 } 586 587 void HTMLDocumentParser::suspendScheduledTasks() 588 { 589 if (m_parserScheduler) 590 m_parserScheduler->suspend(); 591 } 592 593 void HTMLDocumentParser::resumeScheduledTasks() 594 { 595 if (m_parserScheduler) 596 m_parserScheduler->resume(); 597 } 598 599 } 600