1 /* 2 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "HTMLDocumentParser.h" 28 29 #include "ContentSecurityPolicy.h" 30 #include "DocumentFragment.h" 31 #include "Element.h" 32 #include "Frame.h" 33 #include "HTMLNames.h" 34 #include "HTMLParserScheduler.h" 35 #include "HTMLTokenizer.h" 36 #include "HTMLPreloadScanner.h" 37 #include "HTMLScriptRunner.h" 38 #include "HTMLTreeBuilder.h" 39 #include "HTMLDocument.h" 40 #include "InspectorInstrumentation.h" 41 #include "NestingLevelIncrementer.h" 42 #include "Settings.h" 43 44 namespace WebCore { 45 46 using namespace HTMLNames; 47 48 namespace { 49 50 // This is a direct transcription of step 4 from: 51 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case 52 HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors) 53 { 54 if (!contextElement) 55 return HTMLTokenizer::DataState; 56 57 const QualifiedName& contextTag = contextElement->tagQName(); 58 59 if (contextTag.matches(titleTag) || contextTag.matches(textareaTag)) 60 return HTMLTokenizer::RCDATAState; 61 if (contextTag.matches(styleTag) 62 || contextTag.matches(xmpTag) 63 || contextTag.matches(iframeTag) 64 || (contextTag.matches(noembedTag) && HTMLTreeBuilder::pluginsEnabled(contextElement->document()->frame())) 65 || (contextTag.matches(noscriptTag) && HTMLTreeBuilder::scriptEnabled(contextElement->document()->frame())) 66 || contextTag.matches(noframesTag)) 67 return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState; 68 if (contextTag.matches(scriptTag)) 69 return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState; 70 if (contextTag.matches(plaintextTag)) 71 return HTMLTokenizer::PLAINTEXTState; 72 return HTMLTokenizer::DataState; 73 } 74 75 } // namespace 76 77 HTMLDocumentParser::HTMLDocumentParser(HTMLDocument* document, bool reportErrors) 78 : ScriptableDocumentParser(document) 79 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(document))) 80 , m_scriptRunner(HTMLScriptRunner::create(document, this)) 81 , m_treeBuilder(HTMLTreeBuilder::create(this, document, reportErrors, usePreHTML5ParserQuirks(document))) 82 , m_parserScheduler(HTMLParserScheduler::create(this)) 83 , m_xssFilter(this) 84 , m_endWasDelayed(false) 85 , m_pumpSessionNestingLevel(0) 86 { 87 } 88 89 // FIXME: Member variables should be grouped into self-initializing structs to 90 // minimize code duplication between these constructors. 91 HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 92 : ScriptableDocumentParser(fragment->document()) 93 , m_tokenizer(HTMLTokenizer::create(usePreHTML5ParserQuirks(fragment->document()))) 94 , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks(fragment->document()))) 95 , m_xssFilter(this) 96 , m_endWasDelayed(false) 97 , m_pumpSessionNestingLevel(0) 98 { 99 bool reportErrors = false; // For now document fragment parsing never reports errors. 100 m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors)); 101 } 102 103 HTMLDocumentParser::~HTMLDocumentParser() 104 { 105 ASSERT(!m_parserScheduler); 106 ASSERT(!m_pumpSessionNestingLevel); 107 ASSERT(!m_preloadScanner); 108 } 109 110 void HTMLDocumentParser::detach() 111 { 112 DocumentParser::detach(); 113 if (m_scriptRunner) 114 m_scriptRunner->detach(); 115 m_treeBuilder->detach(); 116 // FIXME: It seems wrong that we would have a preload scanner here. 117 // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do. 118 m_preloadScanner.clear(); 119 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 120 } 121 122 void HTMLDocumentParser::stopParsing() 123 { 124 DocumentParser::stopParsing(); 125 m_parserScheduler.clear(); // Deleting the scheduler will clear any timers. 126 } 127 128 // This kicks off "Once the user agent stops parsing" as described by: 129 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end 130 void HTMLDocumentParser::prepareToStopParsing() 131 { 132 ASSERT(!hasInsertionPoint()); 133 134 // pumpTokenizer can cause this parser to be detached from the Document, 135 // but we need to ensure it isn't deleted yet. 136 RefPtr<HTMLDocumentParser> protect(this); 137 138 // NOTE: This pump should only ever emit buffered character tokens, 139 // so ForceSynchronous vs. AllowYield should be meaningless. 140 pumpTokenizerIfPossible(ForceSynchronous); 141 142 if (isStopped()) 143 return; 144 145 DocumentParser::prepareToStopParsing(); 146 147 // We will not have a scriptRunner when parsing a DocumentFragment. 148 if (m_scriptRunner) 149 document()->setReadyState(Document::Interactive); 150 151 attemptToRunDeferredScriptsAndEnd(); 152 } 153 154 bool HTMLDocumentParser::isParsingFragment() const 155 { 156 return m_treeBuilder->isParsingFragment(); 157 } 158 159 bool HTMLDocumentParser::processingData() const 160 { 161 return isScheduledForResume() || inPumpSession(); 162 } 163 164 void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode) 165 { 166 if (isStopped() || m_treeBuilder->isPaused()) 167 return; 168 169 // Once a resume is scheduled, HTMLParserScheduler controls when we next pump. 170 if (isScheduledForResume()) { 171 ASSERT(mode == AllowYield); 172 return; 173 } 174 175 pumpTokenizer(mode); 176 } 177 178 bool HTMLDocumentParser::isScheduledForResume() const 179 { 180 return m_parserScheduler && m_parserScheduler->isScheduledForResume(); 181 } 182 183 // Used by HTMLParserScheduler 184 void HTMLDocumentParser::resumeParsingAfterYield() 185 { 186 // pumpTokenizer can cause this parser to be detached from the Document, 187 // but we need to ensure it isn't deleted yet. 188 RefPtr<HTMLDocumentParser> protect(this); 189 190 // We should never be here unless we can pump immediately. Call pumpTokenizer() 191 // directly so that ASSERTS will fire if we're wrong. 192 pumpTokenizer(AllowYield); 193 endIfDelayed(); 194 } 195 196 bool HTMLDocumentParser::runScriptsForPausedTreeBuilder() 197 { 198 ASSERT(m_treeBuilder->isPaused()); 199 200 TextPosition1 scriptStartPosition = TextPosition1::belowRangePosition(); 201 RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition); 202 // We will not have a scriptRunner when parsing a DocumentFragment. 203 if (!m_scriptRunner) 204 return true; 205 return m_scriptRunner->execute(scriptElement.release(), scriptStartPosition); 206 } 207 208 bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session) 209 { 210 if (isStopped()) 211 return false; 212 213 // The parser will pause itself when waiting on a script to load or run. 214 if (m_treeBuilder->isPaused()) { 215 if (mode == AllowYield) 216 m_parserScheduler->checkForYieldBeforeScript(session); 217 218 // If we don't run the script, we cannot allow the next token to be taken. 219 if (session.needsYield) 220 return false; 221 222 // If we're paused waiting for a script, we try to execute scripts before continuing. 223 bool shouldContinueParsing = runScriptsForPausedTreeBuilder(); 224 m_treeBuilder->setPaused(!shouldContinueParsing); 225 if (!shouldContinueParsing || isStopped()) 226 return false; 227 } 228 229 // FIXME: It's wrong for the HTMLDocumentParser to reach back to the 230 // Frame, but this approach is how the old parser handled 231 // stopping when the page assigns window.location. What really 232 // should happen is that assigning window.location causes the 233 // parser to stop parsing cleanly. The problem is we're not 234 // perpared to do that at every point where we run JavaScript. 235 if (!isParsingFragment() 236 && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending()) 237 return false; 238 239 if (mode == AllowYield) 240 m_parserScheduler->checkForYieldBeforeToken(session); 241 242 return true; 243 } 244 245 void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode) 246 { 247 ASSERT(!isStopped()); 248 ASSERT(!isScheduledForResume()); 249 // ASSERT that this object is both attached to the Document and protected. 250 ASSERT(refCount() >= 2); 251 252 PumpSession session(m_pumpSessionNestingLevel); 253 254 // We tell the InspectorInstrumentation about every pump, even if we 255 // end up pumping nothing. It can filter out empty pumps itself. 256 // FIXME: m_input.current().length() is only accurate if we 257 // end up parsing the whole buffer in this pump. We should pass how 258 // much we parsed as part of didWriteHTML instead of willWriteHTML. 259 InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().length(), m_tokenizer->lineNumber()); 260 261 while (canTakeNextToken(mode, session) && !session.needsYield) { 262 if (!isParsingFragment()) 263 m_sourceTracker.start(m_input, m_token); 264 265 if (!m_tokenizer->nextToken(m_input.current(), m_token)) 266 break; 267 268 if (!isParsingFragment()) { 269 m_sourceTracker.end(m_input, m_token); 270 271 // We do not XSS filter innerHTML, which means we (intentionally) fail 272 // http/tests/security/xssAuditor/dom-write-innerHTML.html 273 m_xssFilter.filterToken(m_token); 274 } 275 276 m_treeBuilder->constructTreeFromToken(m_token); 277 ASSERT(m_token.isUninitialized()); 278 } 279 280 // Ensure we haven't been totally deref'ed after pumping. Any caller of this 281 // function should be holding a RefPtr to this to ensure we weren't deleted. 282 ASSERT(refCount() >= 1); 283 284 if (isStopped()) 285 return; 286 287 if (session.needsYield) 288 m_parserScheduler->scheduleForResume(); 289 290 if (isWaitingForScripts()) { 291 ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState); 292 if (!m_preloadScanner) { 293 m_preloadScanner.set(new HTMLPreloadScanner(document())); 294 m_preloadScanner->appendToEnd(m_input.current()); 295 } 296 m_preloadScanner->scan(); 297 } 298 299 InspectorInstrumentation::didWriteHTML(cookie, m_tokenizer->lineNumber()); 300 } 301 302 bool HTMLDocumentParser::hasInsertionPoint() 303 { 304 // FIXME: The wasCreatedByScript() branch here might not be fully correct. 305 // Our model of the EOF character differs slightly from the one in 306 // the spec because our treatment is uniform between network-sourced 307 // and script-sourced input streams whereas the spec treats them 308 // differently. 309 return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile()); 310 } 311 312 void HTMLDocumentParser::insert(const SegmentedString& source) 313 { 314 if (isStopped()) 315 return; 316 317 // pumpTokenizer can cause this parser to be detached from the Document, 318 // but we need to ensure it isn't deleted yet. 319 RefPtr<HTMLDocumentParser> protect(this); 320 321 SegmentedString excludedLineNumberSource(source); 322 excludedLineNumberSource.setExcludeLineNumbers(); 323 m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource); 324 pumpTokenizerIfPossible(ForceSynchronous); 325 326 if (isWaitingForScripts()) { 327 // Check the document.write() output with a separate preload scanner as 328 // the main scanner can't deal with insertions. 329 HTMLPreloadScanner preloadScanner(document()); 330 preloadScanner.appendToEnd(source); 331 preloadScanner.scan(); 332 } 333 334 endIfDelayed(); 335 } 336 337 void HTMLDocumentParser::append(const SegmentedString& source) 338 { 339 if (isStopped()) 340 return; 341 342 // pumpTokenizer can cause this parser to be detached from the Document, 343 // but we need to ensure it isn't deleted yet. 344 RefPtr<HTMLDocumentParser> protect(this); 345 346 if (m_preloadScanner) { 347 if (m_input.current().isEmpty() && !isWaitingForScripts()) { 348 // We have parsed until the end of the current input and so are now moving ahead of the preload scanner. 349 // Clear the scanner so we know to scan starting from the current input point if we block again. 350 m_preloadScanner.clear(); 351 } else { 352 m_preloadScanner->appendToEnd(source); 353 if (isWaitingForScripts()) 354 m_preloadScanner->scan(); 355 } 356 } 357 358 m_input.appendToEnd(source); 359 360 if (inPumpSession()) { 361 // We've gotten data off the network in a nested write. 362 // We don't want to consume any more of the input stream now. Do 363 // not worry. We'll consume this data in a less-nested write(). 364 return; 365 } 366 367 pumpTokenizerIfPossible(AllowYield); 368 369 endIfDelayed(); 370 } 371 372 void HTMLDocumentParser::end() 373 { 374 ASSERT(!isDetached()); 375 ASSERT(!isScheduledForResume()); 376 377 // Informs the the rest of WebCore that parsing is really finished (and deletes this). 378 m_treeBuilder->finished(); 379 } 380 381 void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd() 382 { 383 ASSERT(isStopping()); 384 ASSERT(!hasInsertionPoint()); 385 if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing()) 386 return; 387 end(); 388 } 389 390 void HTMLDocumentParser::attemptToEnd() 391 { 392 // finish() indicates we will not receive any more data. If we are waiting on 393 // an external script to load, we can't finish parsing quite yet. 394 395 if (shouldDelayEnd()) { 396 m_endWasDelayed = true; 397 return; 398 } 399 prepareToStopParsing(); 400 } 401 402 void HTMLDocumentParser::endIfDelayed() 403 { 404 // If we've already been detached, don't bother ending. 405 if (isDetached()) 406 return; 407 408 if (!m_endWasDelayed || shouldDelayEnd()) 409 return; 410 411 m_endWasDelayed = false; 412 prepareToStopParsing(); 413 } 414 415 void HTMLDocumentParser::finish() 416 { 417 // FIXME: We should ASSERT(!m_parserStopped) here, since it does not 418 // makes sense to call any methods on DocumentParser once it's been stopped. 419 // However, FrameLoader::stop calls Document::finishParsing unconditionally 420 // which in turn calls m_parser->finish(). 421 422 // We're not going to get any more data off the network, so we tell the 423 // input stream we've reached the end of file. finish() can be called more 424 // than once, if the first time does not call end(). 425 if (!m_input.haveSeenEndOfFile()) 426 m_input.markEndOfFile(); 427 attemptToEnd(); 428 } 429 430 bool HTMLDocumentParser::finishWasCalled() 431 { 432 return m_input.haveSeenEndOfFile(); 433 } 434 435 // This function is virtual and just for the DocumentParser interface. 436 bool HTMLDocumentParser::isExecutingScript() const 437 { 438 return inScriptExecution(); 439 } 440 441 // This function is non-virtual and used throughout the implementation. 442 bool HTMLDocumentParser::inScriptExecution() const 443 { 444 if (!m_scriptRunner) 445 return false; 446 return m_scriptRunner->isExecutingScript(); 447 } 448 449 String HTMLDocumentParser::sourceForToken(const HTMLToken& token) 450 { 451 return m_sourceTracker.sourceForToken(token); 452 } 453 454 int HTMLDocumentParser::lineNumber() const 455 { 456 return m_tokenizer->lineNumber(); 457 } 458 459 TextPosition0 HTMLDocumentParser::textPosition() const 460 { 461 const SegmentedString& currentString = m_input.current(); 462 WTF::ZeroBasedNumber line = currentString.currentLine(); 463 WTF::ZeroBasedNumber column = currentString.currentColumn(); 464 ASSERT(m_tokenizer->lineNumber() == line.zeroBasedInt()); 465 466 return TextPosition0(line, column); 467 } 468 469 bool HTMLDocumentParser::isWaitingForScripts() const 470 { 471 return m_treeBuilder->isPaused(); 472 } 473 474 void HTMLDocumentParser::resumeParsingAfterScriptExecution() 475 { 476 ASSERT(!inScriptExecution()); 477 ASSERT(!m_treeBuilder->isPaused()); 478 479 pumpTokenizerIfPossible(AllowYield); 480 endIfDelayed(); 481 } 482 483 void HTMLDocumentParser::watchForLoad(CachedResource* cachedScript) 484 { 485 ASSERT(!cachedScript->isLoaded()); 486 // addClient would call notifyFinished if the load were complete. 487 // Callers do not expect to be re-entered from this call, so they should 488 // not an already-loaded CachedResource. 489 cachedScript->addClient(this); 490 } 491 492 void HTMLDocumentParser::stopWatchingForLoad(CachedResource* cachedScript) 493 { 494 cachedScript->removeClient(this); 495 } 496 497 void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan() 498 { 499 ASSERT(m_preloadScanner); 500 m_preloadScanner->appendToEnd(m_input.current()); 501 m_preloadScanner->scan(); 502 } 503 504 void HTMLDocumentParser::notifyFinished(CachedResource* cachedResource) 505 { 506 // pumpTokenizer can cause this parser to be detached from the Document, 507 // but we need to ensure it isn't deleted yet. 508 RefPtr<HTMLDocumentParser> protect(this); 509 510 ASSERT(m_scriptRunner); 511 ASSERT(!inScriptExecution()); 512 if (isStopping()) { 513 attemptToRunDeferredScriptsAndEnd(); 514 return; 515 } 516 517 ASSERT(m_treeBuilder->isPaused()); 518 // Note: We only ever wait on one script at a time, so we always know this 519 // is the one we were waiting on and can un-pause the tree builder. 520 m_treeBuilder->setPaused(false); 521 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForLoad(cachedResource); 522 m_treeBuilder->setPaused(!shouldContinueParsing); 523 if (shouldContinueParsing) 524 resumeParsingAfterScriptExecution(); 525 } 526 527 void HTMLDocumentParser::executeScriptsWaitingForStylesheets() 528 { 529 // Document only calls this when the Document owns the DocumentParser 530 // so this will not be called in the DocumentFragment case. 531 ASSERT(m_scriptRunner); 532 // Ignore calls unless we have a script blocking the parser waiting on a 533 // stylesheet load. Otherwise we are currently parsing and this 534 // is a re-entrant call from encountering a </ style> tag. 535 if (!m_scriptRunner->hasScriptsWaitingForStylesheets()) 536 return; 537 538 // pumpTokenizer can cause this parser to be detached from the Document, 539 // but we need to ensure it isn't deleted yet. 540 RefPtr<HTMLDocumentParser> protect(this); 541 542 ASSERT(!m_scriptRunner->isExecutingScript()); 543 ASSERT(m_treeBuilder->isPaused()); 544 // Note: We only ever wait on one script at a time, so we always know this 545 // is the one we were waiting on and can un-pause the tree builder. 546 m_treeBuilder->setPaused(false); 547 bool shouldContinueParsing = m_scriptRunner->executeScriptsWaitingForStylesheets(); 548 m_treeBuilder->setPaused(!shouldContinueParsing); 549 if (shouldContinueParsing) 550 resumeParsingAfterScriptExecution(); 551 } 552 553 ScriptController* HTMLDocumentParser::script() const 554 { 555 return document()->frame() ? document()->frame()->script() : 0; 556 } 557 558 void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission) 559 { 560 RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, scriptingPermission); 561 parser->insert(source); // Use insert() so that the parser will not yield. 562 parser->finish(); 563 ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151> 564 parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction. 565 } 566 567 bool HTMLDocumentParser::usePreHTML5ParserQuirks(Document* document) 568 { 569 ASSERT(document); 570 return document->settings() && document->settings()->usePreHTML5ParserQuirks(); 571 } 572 573 void HTMLDocumentParser::suspendScheduledTasks() 574 { 575 if (m_parserScheduler) 576 m_parserScheduler->suspend(); 577 } 578 579 void HTMLDocumentParser::resumeScheduledTasks() 580 { 581 if (m_parserScheduler) 582 m_parserScheduler->resume(); 583 } 584 585 } 586