Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "core/html/parser/HTMLDocumentParser.h"
     28 
     29 #include "HTMLNames.h"
     30 #include "core/dom/DocumentFragment.h"
     31 #include "core/dom/Element.h"
     32 #include "core/html/parser/AtomicHTMLToken.h"
     33 #include "core/html/parser/BackgroundHTMLParser.h"
     34 #include "core/html/parser/CompactHTMLToken.h"
     35 #include "core/html/parser/HTMLIdentifier.h"
     36 #include "core/html/parser/HTMLParserScheduler.h"
     37 #include "core/html/parser/HTMLParserThread.h"
     38 #include "core/html/parser/HTMLPreloadScanner.h"
     39 #include "core/html/parser/HTMLScriptRunner.h"
     40 #include "core/html/parser/HTMLTokenizer.h"
     41 #include "core/html/parser/HTMLTreeBuilder.h"
     42 #include "core/inspector/InspectorInstrumentation.h"
     43 #include "core/page/Frame.h"
     44 #include "core/platform/chromium/TraceEvent.h"
     45 #include "wtf/Functional.h"
     46 
     47 namespace WebCore {
     48 
     49 using namespace HTMLNames;
     50 
     51 // This is a direct transcription of step 4 from:
     52 // http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#fragment-case
     53 static HTMLTokenizer::State tokenizerStateForContextElement(Element* contextElement, bool reportErrors, const HTMLParserOptions& options)
     54 {
     55     if (!contextElement)
     56         return HTMLTokenizer::DataState;
     57 
     58     const QualifiedName& contextTag = contextElement->tagQName();
     59 
     60     if (contextTag.matches(titleTag) || contextTag.matches(textareaTag))
     61         return HTMLTokenizer::RCDATAState;
     62     if (contextTag.matches(styleTag)
     63         || contextTag.matches(xmpTag)
     64         || contextTag.matches(iframeTag)
     65         || (contextTag.matches(noembedTag) && options.pluginsEnabled)
     66         || (contextTag.matches(noscriptTag) && options.scriptEnabled)
     67         || contextTag.matches(noframesTag))
     68         return reportErrors ? HTMLTokenizer::RAWTEXTState : HTMLTokenizer::PLAINTEXTState;
     69     if (contextTag.matches(scriptTag))
     70         return reportErrors ? HTMLTokenizer::ScriptDataState : HTMLTokenizer::PLAINTEXTState;
     71     if (contextTag.matches(plaintextTag))
     72         return HTMLTokenizer::PLAINTEXTState;
     73     return HTMLTokenizer::DataState;
     74 }
     75 
// Constructs the parser used for a whole Document.
//
// When m_options.useThreading is set, m_token and m_tokenizer start out null;
// pinToMainThread() and insert() create them on demand. The parser starts
// unpinned (m_isPinnedToMainThread is false) and without a background parser.
HTMLDocumentParser::HTMLDocumentParser(Document* document, bool reportErrors)
    : ScriptableDocumentParser(document)
    , m_options(document)
    , m_token(m_options.useThreading ? nullptr : adoptPtr(new HTMLToken))
    , m_tokenizer(m_options.useThreading ? nullptr : HTMLTokenizer::create(m_options))
    , m_scriptRunner(HTMLScriptRunner::create(document, this))
    , m_treeBuilder(HTMLTreeBuilder::create(this, document, parserContentPolicy(), reportErrors, m_options))
    , m_parserScheduler(HTMLParserScheduler::create(this))
    , m_xssAuditorDelegate(document)
    , m_weakFactory(this)
    , m_preloader(adoptPtr(new HTMLResourcePreloader(document)))
    , m_isPinnedToMainThread(false)
    , m_endWasDelayed(false)
    , m_haveBackgroundParser(false)
    , m_pumpSessionNestingLevel(0)
{
    // Unless shouldUseThreading() holds, both the token and the tokenizer must exist.
    ASSERT(shouldUseThreading() || (m_token && m_tokenizer));
}
     94 
// Constructs the parser used for a DocumentFragment (with an optional context element).
//
// Unlike the Document constructor, this one always creates m_token and
// m_tokenizer, starts pinned to the main thread, and does not create a script
// runner, parser scheduler or preloader in its initializer list.
//
// FIXME: Member variables should be grouped into self-initializing structs to
// minimize code duplication between these constructors.
HTMLDocumentParser::HTMLDocumentParser(DocumentFragment* fragment, Element* contextElement, ParserContentPolicy parserContentPolicy)
    : ScriptableDocumentParser(fragment->document(), parserContentPolicy)
    , m_options(fragment->document())
    , m_token(adoptPtr(new HTMLToken))
    , m_tokenizer(HTMLTokenizer::create(m_options))
    , m_treeBuilder(HTMLTreeBuilder::create(this, fragment, contextElement, this->parserContentPolicy(), m_options))
    , m_xssAuditorDelegate(fragment->document())
    , m_weakFactory(this)
    , m_isPinnedToMainThread(true)
    , m_endWasDelayed(false)
    , m_haveBackgroundParser(false)
    , m_pumpSessionNestingLevel(0)
{
    ASSERT(!shouldUseThreading());
    bool reportErrors = false; // For now document fragment parsing never reports errors.
    // The tokenizer's initial state depends on the context element.
    m_tokenizer->setState(tokenizerStateForContextElement(contextElement, reportErrors, m_options));
    m_xssAuditor.initForFragment();
}
    115 
// Destructor. Only checks invariants: by this point the scheduler and both
// preload scanners must have been cleared (detach() does this), no pump
// session may be in progress, and the background parser must have been stopped.
HTMLDocumentParser::~HTMLDocumentParser()
{
    ASSERT(!m_parserScheduler);
    ASSERT(!m_pumpSessionNestingLevel);
    ASSERT(!m_preloadScanner);
    ASSERT(!m_insertionPreloadScanner);
    ASSERT(!m_haveBackgroundParser);
    // FIXME: We should be able to ASSERT(m_speculations.isEmpty()),
    // but there are cases where that's not true currently. For example,
    // if we're told to stop parsing before we've consumed all the input.
}
    127 
    128 void HTMLDocumentParser::pinToMainThread()
    129 {
    130     ASSERT(!m_haveBackgroundParser);
    131     ASSERT(!m_isPinnedToMainThread);
    132     m_isPinnedToMainThread = true;
    133     if (!m_tokenizer) {
    134         ASSERT(!m_token);
    135         m_token = adoptPtr(new HTMLToken);
    136         m_tokenizer = HTMLTokenizer::create(m_options);
    137     }
    138 }
    139 
// Detaches the parser: stops the background parser if one is running, then
// detaches the base class, the script runner (if any) and the tree builder,
// and finally drops the preload scanners and the scheduler.
void HTMLDocumentParser::detach()
{
    if (m_haveBackgroundParser)
        stopBackgroundParser();
    DocumentParser::detach();
    // The script runner is only created by the Document constructor, so it may be null.
    if (m_scriptRunner)
        m_scriptRunner->detach();
    m_treeBuilder->detach();
    // FIXME: It seems wrong that we would have a preload scanner here.
    // Yet during fast/dom/HTMLScriptElement/script-load-events.html we do.
    m_preloadScanner.clear();
    m_insertionPreloadScanner.clear();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
}
    154 
// Stops parsing: delegates to DocumentParser::stopParsing(), destroys the
// scheduler, and stops the background parser if one is running.
void HTMLDocumentParser::stopParsing()
{
    DocumentParser::stopParsing();
    m_parserScheduler.clear(); // Deleting the scheduler will clear any timers.
    if (m_haveBackgroundParser)
        stopBackgroundParser();
}
    162 
// This kicks off "Once the user agent stops parsing" as described by:
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#the-end
//
// Sequence: flush the main-thread tokenizer (if there is one), bail out if
// that stopped the parser, run the base-class step, move the document to the
// Interactive ready state (only when a script runner exists), and finally
// try to run deferred scripts and end.
void HTMLDocumentParser::prepareToStopParsing()
{
    // FIXME: It may not be correct to disable this for the background parser.
    // That means hasInsertionPoint() may not be correct in some cases.
    ASSERT(!hasInsertionPoint() || m_haveBackgroundParser);

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    // NOTE: This pump should only ever emit buffered character tokens,
    // so ForceSynchronous vs. AllowYield should be meaningless.
    if (m_tokenizer) {
        ASSERT(!m_haveBackgroundParser);
        pumpTokenizerIfPossible(ForceSynchronous);
    }

    // The pump above may have stopped the parser.
    if (isStopped())
        return;

    DocumentParser::prepareToStopParsing();

    // We will not have a scriptRunner when parsing a DocumentFragment.
    if (m_scriptRunner)
        document()->setReadyState(Document::Interactive);

    // Setting the ready state above can fire mutation event and detach us
    // from underneath. In that case, just bail out.
    if (isDetached())
        return;

    attemptToRunDeferredScriptsAndEnd();
}
    198 
    199 bool HTMLDocumentParser::isParsingFragment() const
    200 {
    201     return m_treeBuilder->isParsingFragment();
    202 }
    203 
    204 bool HTMLDocumentParser::processingData() const
    205 {
    206     return isScheduledForResume() || inPumpSession() || m_haveBackgroundParser;
    207 }
    208 
    209 void HTMLDocumentParser::pumpTokenizerIfPossible(SynchronousMode mode)
    210 {
    211     if (isStopped())
    212         return;
    213     if (isWaitingForScripts())
    214         return;
    215 
    216     // Once a resume is scheduled, HTMLParserScheduler controls when we next pump.
    217     if (isScheduledForResume()) {
    218         ASSERT(mode == AllowYield);
    219         return;
    220     }
    221 
    222     pumpTokenizer(mode);
    223 }
    224 
    225 bool HTMLDocumentParser::isScheduledForResume() const
    226 {
    227     return m_parserScheduler && m_parserScheduler->isScheduledForResume();
    228 }
    229 
    230 // Used by HTMLParserScheduler
    231 void HTMLDocumentParser::resumeParsingAfterYield()
    232 {
    233     ASSERT(!m_isPinnedToMainThread);
    234     // pumpTokenizer can cause this parser to be detached from the Document,
    235     // but we need to ensure it isn't deleted yet.
    236     RefPtr<HTMLDocumentParser> protect(this);
    237 
    238     if (m_haveBackgroundParser) {
    239         pumpPendingSpeculations();
    240         return;
    241     }
    242 
    243     // We should never be here unless we can pump immediately.  Call pumpTokenizer()
    244     // directly so that ASSERTS will fire if we're wrong.
    245     pumpTokenizer(AllowYield);
    246     endIfDelayed();
    247 }
    248 
    249 void HTMLDocumentParser::runScriptsForPausedTreeBuilder()
    250 {
    251     ASSERT(scriptingContentIsAllowed(parserContentPolicy()));
    252 
    253     TextPosition scriptStartPosition = TextPosition::belowRangePosition();
    254     RefPtr<Element> scriptElement = m_treeBuilder->takeScriptToProcess(scriptStartPosition);
    255     // We will not have a scriptRunner when parsing a DocumentFragment.
    256     if (m_scriptRunner)
    257         m_scriptRunner->execute(scriptElement.release(), scriptStartPosition);
    258 }
    259 
// Decides whether the pump loop may consume another token.
//
// Returns false when the parser is stopped, when a yield is needed before
// running a pending script, when the parser is still blocked on scripts after
// trying to run them, or when a location change is pending on the frame of a
// non-fragment document. In AllowYield mode the scheduler is consulted and may
// set session.needsYield; the caller is expected to check that flag as well.
bool HTMLDocumentParser::canTakeNextToken(SynchronousMode mode, PumpSession& session)
{
    if (isStopped())
        return false;

    ASSERT(!m_haveBackgroundParser || mode == ForceSynchronous);

    if (isWaitingForScripts()) {
        if (mode == AllowYield)
            m_parserScheduler->checkForYieldBeforeScript(session);

        // If we don't run the script, we cannot allow the next token to be taken.
        if (session.needsYield)
            return false;

        // If we're paused waiting for a script, we try to execute scripts before continuing.
        runScriptsForPausedTreeBuilder();
        // Running scripts may have stopped the parser or left it blocked on another script.
        if (isStopped())
            return false;
        if (isWaitingForScripts())
            return false;
    }

    // FIXME: It's wrong for the HTMLDocumentParser to reach back to the
    //        Frame, but this approach is how the old parser handled
    //        stopping when the page assigns window.location.  What really
    //        should happen is that assigning window.location causes the
    //        parser to stop parsing cleanly.  The problem is we're not
    //        prepared to do that at every point where we run JavaScript.
    if (!isParsingFragment()
        && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending())
        return false;

    if (mode == AllowYield)
        m_parserScheduler->checkForYieldBeforeToken(session);

    return true;
}
    298 
// Receives a chunk produced by the background parser. The chunk is either
// queued as a speculation (its preloads are issued right away) or appended
// and processed immediately via pumpPendingSpeculations().
void HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser(PassOwnPtr<ParsedChunk> chunk)
{
    // alert(), runModalDialog, and the JavaScript Debugger all run nested event loops
    // which can cause this method to be re-entered. We detect re-entry using
    // document()->activeParserCount(), save the chunk as a speculation, and return.
    // The chunk is also queued when we are waiting for scripts or when earlier
    // speculations are still pending.
    if (isWaitingForScripts() || !m_speculations.isEmpty() || document()->activeParserCount() > 0) {
        m_preloader->takeAndPreload(chunk->preloads);
        m_speculations.append(chunk);
        return;
    }

    // processParsedChunkFromBackgroundParser can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    ASSERT(m_speculations.isEmpty());
    chunk->preloads.clear(); // We don't need to preload because we're going to parse immediately.
    m_speculations.append(chunk);
    pumpPendingSpeculations();
}
    319 
// Checks whether the queued speculations are still usable after processing
// |chunk| (the chunk that ended in a script).
//
// - While still waiting for scripts, the chunk is stashed in
//   m_lastChunkBeforeScript and nothing else happens.
// - Otherwise the main-thread token and tokenizer are taken out of the
//   members (leaving both null). If there was no tokenizer, or the tokenizer
//   and tree builder state match what the background parser assumed, the
//   speculations are kept; if not, they are discarded and the background
//   parser is resumed from this chunk's checkpoint.
void HTMLDocumentParser::validateSpeculations(PassOwnPtr<ParsedChunk> chunk)
{
    ASSERT(chunk);
    if (isWaitingForScripts()) {
        // We're waiting on a network script, just save the chunk, we'll get
        // a second validateSpeculations call after the script completes.
        // This call should have been made immediately after runScriptsForPausedTreeBuilder
        // which may have started a network load and left us waiting.
        ASSERT(!m_lastChunkBeforeScript);
        m_lastChunkBeforeScript = chunk;
        return;
    }

    ASSERT(!m_lastChunkBeforeScript);
    // Take ownership of the main-thread tokenizer and token; the members are null from here on.
    OwnPtr<HTMLTokenizer> tokenizer = m_tokenizer.release();
    OwnPtr<HTMLToken> token = m_token.release();

    if (!tokenizer) {
        // There must not have been any changes to the HTMLTokenizer state on
        // the main thread, which means the speculation buffer is correct.
        return;
    }

    // Currently we're only smart enough to reuse the speculation buffer if the tokenizer
    // both starts and ends in the DataState. That state is simplest because the HTMLToken
    // is always in the Uninitialized state. We should consider whether we can reuse the
    // speculation buffer in other states, but we'd likely need to do something more
    // sophisticated with the HTMLToken.
    if (chunk->tokenizerState == HTMLTokenizer::DataState
        && tokenizer->state() == HTMLTokenizer::DataState
        && m_input.current().isEmpty()
        && chunk->treeBuilderState == HTMLTreeBuilderSimulator::stateFor(m_treeBuilder.get())) {
        ASSERT(token->isUninitialized());
        return;
    }

    discardSpeculationsAndResumeFrom(chunk, token.release(), tokenizer.release());
}
    358 
// Throws away all queued speculations and asks the background parser to
// resume from the state captured here.
//
// The checkpoint posted to the parser thread carries a fresh weak pointer to
// this parser, the given token and tokenizer, the current tree builder state,
// the input and preload-scanner checkpoints of |lastChunkBeforeScript|, and an
// isolated copy of the not-yet-parsed main-thread input (which is then cleared).
void HTMLDocumentParser::discardSpeculationsAndResumeFrom(PassOwnPtr<ParsedChunk> lastChunkBeforeScript, PassOwnPtr<HTMLToken> token, PassOwnPtr<HTMLTokenizer> tokenizer)
{
    // Revoke previously handed-out weak pointers before creating the new one below.
    m_weakFactory.revokeAll();
    m_speculations.clear();

    OwnPtr<BackgroundHTMLParser::Checkpoint> checkpoint = adoptPtr(new BackgroundHTMLParser::Checkpoint);
    checkpoint->parser = m_weakFactory.createWeakPtr();
    checkpoint->token = token;
    checkpoint->tokenizer = tokenizer;
    checkpoint->treeBuilderState = HTMLTreeBuilderSimulator::stateFor(m_treeBuilder.get());
    checkpoint->inputCheckpoint = lastChunkBeforeScript->inputCheckpoint;
    checkpoint->preloadScannerCheckpoint = lastChunkBeforeScript->preloadScannerCheckpoint;
    checkpoint->unparsedInput = m_input.current().toString().isolatedCopy();
    m_input.current().clear(); // FIXME: This should be passed in instead of cleared.

    ASSERT(checkpoint->unparsedInput.isSafeToSendToAnotherThread());
    HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::resumeFrom, m_backgroundParser, checkpoint.release()));
}
    377 
// Feeds one chunk produced by the background parser into the tree builder.
//
// Steps: notify the parser thread that this chunk was started, report the
// chunk's XSS infos to the auditor delegate, then build the tree token by
// token. Token processing ends early when the parser is stopped, a location
// change is pending, the tree builder pauses for a script (the script is run
// and the speculations are validated), or the EOF token is reached.
//
// Preconditions (asserted): not a fragment parser, not stopped or waiting for
// scripts, threading in use, and no main-thread token/tokenizer present.
void HTMLDocumentParser::processParsedChunkFromBackgroundParser(PassOwnPtr<ParsedChunk> popChunk)
{
    ASSERT_WITH_SECURITY_IMPLICATION(!document()->activeParserCount());
    ASSERT(!isParsingFragment());
    ASSERT(!isWaitingForScripts());
    ASSERT(!isStopped());
    // ASSERT that this object is both attached to the Document and protected.
    ASSERT(refCount() >= 2);
    ASSERT(shouldUseThreading());
    ASSERT(!m_tokenizer);
    ASSERT(!m_token);
    ASSERT(!m_lastChunkBeforeScript);

    ActiveParserSession session(contextForParsingSession());

    // The token stream is moved out of the chunk into its own OwnPtr, so it
    // stays valid after the chunk is handed to validateSpeculations() below.
    OwnPtr<ParsedChunk> chunk(popChunk);
    OwnPtr<CompactHTMLTokenStream> tokens = chunk->tokens.release();

    HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::startedChunkWithCheckpoint, m_backgroundParser, chunk->inputCheckpoint));

    // Report every XSS finding recorded for this chunk, at its own text position.
    for (XSSInfoStream::const_iterator it = chunk->xssInfos.begin(); it != chunk->xssInfos.end(); ++it) {
        m_textPosition = (*it)->m_textPosition;
        m_xssAuditorDelegate.didBlockScript(**it);
        if (isStopped())
            break;
    }

    for (Vector<CompactHTMLToken>::const_iterator it = tokens->begin(); it != tokens->end(); ++it) {
        ASSERT(!isWaitingForScripts());

        if (!isParsingFragment()
            && document()->frame() && document()->frame()->navigationScheduler()->locationChangePending()) {

            // To match main-thread parser behavior (which never checks locationChangePending on the EOF path)
            // we peek to see if this chunk has an EOF and process it anyway.
            if (tokens->last().type() == HTMLToken::EndOfFile) {
                ASSERT(m_speculations.isEmpty()); // There should never be any chunks after the EOF.
                prepareToStopParsing();
            }
            break;
        }

        m_textPosition = it->textPosition();

        constructTreeFromCompactHTMLToken(*it);

        if (isStopped())
            break;

        if (isWaitingForScripts()) {
            ASSERT(it + 1 == tokens->end()); // The </script> is assumed to be the last token of this bunch.
            runScriptsForPausedTreeBuilder();
            // Ownership of the chunk passes to validateSpeculations(); it must not be used afterwards.
            validateSpeculations(chunk.release());
            break;
        }

        if (it->type() == HTMLToken::EndOfFile) {
            ASSERT(it + 1 == tokens->end()); // The EOF is assumed to be the last token of this bunch.
            ASSERT(m_speculations.isEmpty()); // There should never be any chunks after the EOF.
            prepareToStopParsing();
            break;
        }

        ASSERT(!m_tokenizer);
        ASSERT(!m_token);
    }
}
    445 
// Processes queued speculation chunks in order until the queue is empty, the
// parser stops or starts waiting for scripts, or more than parserTimeLimit
// seconds have elapsed while chunks remain (then a resume is scheduled).
// The whole run is bracketed by willWriteHTML/didWriteHTML instrumentation calls.
void HTMLDocumentParser::pumpPendingSpeculations()
{
    // FIXME: Share this constant with the parser scheduler.
    const double parserTimeLimit = 0.500;

    // ASSERT that this object is both attached to the Document and protected.
    ASSERT(refCount() >= 2);
    // If this assert fails, you need to call validateSpeculations to make sure
    // m_tokenizer and m_token don't have state that invalidates m_speculations.
    ASSERT(!m_tokenizer);
    ASSERT(!m_token);
    ASSERT(!m_lastChunkBeforeScript);
    ASSERT(!isWaitingForScripts());
    ASSERT(!isStopped());

    // FIXME: Pass in current input length.
    InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), lineNumber().zeroBasedInt());

    double startTime = currentTime();

    while (!m_speculations.isEmpty()) {
        processParsedChunkFromBackgroundParser(m_speculations.takeFirst());

        // The order matters! If this isStopped(), isWaitingForScripts() can hit and ASSERT since
        // m_document can be null which is used to decide the readiness.
        if (isStopped())
            break;
        if (isWaitingForScripts())
            break;

        // Yield only if there is still work left; otherwise the loop ends on its own.
        if (currentTime() - startTime > parserTimeLimit && !m_speculations.isEmpty()) {
            m_parserScheduler->scheduleForResume();
            break;
        }
    }

    InspectorInstrumentation::didWriteHTML(cookie, lineNumber().zeroBasedInt());
}
    484 
    485 void HTMLDocumentParser::forcePlaintextForTextDocument()
    486 {
    487     if (shouldUseThreading()) {
    488         // This method is called before any data is appended, so we have to start
    489         // the background parser ourselves.
    490         if (!m_haveBackgroundParser)
    491             startBackgroundParser();
    492 
    493         HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::forcePlaintextForTextDocument, m_backgroundParser));
    494     } else
    495         m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
    496 }
    497 
    498 Document* HTMLDocumentParser::contextForParsingSession()
    499 {
    500     // The parsing session should interact with the document only when parsing
    501     // non-fragments. Otherwise, we might delay the load event mistakenly.
    502     if (isParsingFragment())
    503         return 0;
    504     return document();
    505 }
    506 
// Main-thread pump loop: repeatedly tokenizes the current input and feeds each
// token to the tree builder until canTakeNextToken() refuses, a yield is
// requested, or the tokenizer cannot produce another token.
//
// For non-fragment parsing each token is also tracked by m_sourceTracker and
// run through the XSS auditor. After the loop, a resume is scheduled if a
// yield was requested, and the preload scanner is run if the parser is now
// blocked on a script. Callers must hold a reference to this parser.
void HTMLDocumentParser::pumpTokenizer(SynchronousMode mode)
{
    ASSERT(!isStopped());
    ASSERT(!isScheduledForResume());
    // ASSERT that this object is both attached to the Document and protected.
    ASSERT(refCount() >= 2);
    ASSERT(m_tokenizer);
    ASSERT(m_token);
    ASSERT(!m_haveBackgroundParser || mode == ForceSynchronous);

    PumpSession session(m_pumpSessionNestingLevel, contextForParsingSession());

    // We tell the InspectorInstrumentation about every pump, even if we
    // end up pumping nothing.  It can filter out empty pumps itself.
    // FIXME: m_input.current().length() is only accurate if we
    // end up parsing the whole buffer in this pump.  We should pass how
    // much we parsed as part of didWriteHTML instead of willWriteHTML.
    InspectorInstrumentationCookie cookie = InspectorInstrumentation::willWriteHTML(document(), m_input.current().currentLine().zeroBasedInt());

    m_xssAuditor.init(document(), &m_xssAuditorDelegate);

    while (canTakeNextToken(mode, session) && !session.needsYield) {
        if (!isParsingFragment())
            m_sourceTracker.start(m_input.current(), m_tokenizer.get(), token());

        // Stop pumping when the tokenizer cannot produce a token from the current input.
        if (!m_tokenizer->nextToken(m_input.current(), token()))
            break;

        if (!isParsingFragment()) {
            m_sourceTracker.end(m_input.current(), m_tokenizer.get(), token());

            // We do not XSS filter innerHTML, which means we (intentionally) fail
            // http/tests/security/xssAuditor/dom-write-innerHTML.html
            if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor.filterToken(FilterTokenRequest(token(), m_sourceTracker, m_tokenizer->shouldAllowCDATA())))
                m_xssAuditorDelegate.didBlockScript(*xssInfo);
        }

        constructTreeFromHTMLToken(token());
        ASSERT(token().isUninitialized());
    }

    // Ensure we haven't been totally deref'ed after pumping. Any caller of this
    // function should be holding a RefPtr to this to ensure we weren't deleted.
    ASSERT(refCount() >= 1);

    // NOTE(review): this early return skips the didWriteHTML() call at the end
    // of the function - confirm that is intended.
    if (isStopped())
        return;

    if (session.needsYield)
        m_parserScheduler->scheduleForResume();

    if (isWaitingForScripts()) {
        ASSERT(m_tokenizer->state() == HTMLTokenizer::DataState);
        // Create the preload scanner lazily, seeded with the remaining input.
        if (!m_preloadScanner) {
            m_preloadScanner = adoptPtr(new HTMLPreloadScanner(m_options, document()->url()));
            m_preloadScanner->appendToEnd(m_input.current());
        }
        m_preloadScanner->scan(m_preloader.get(), document()->baseElementURL());
    }

    InspectorInstrumentation::didWriteHTML(cookie, m_input.current().currentLine().zeroBasedInt());
}
    569 
// Converts |rawToken| to an AtomicHTMLToken, feeds it to the tree builder, and
// guarantees |rawToken| is cleared (uninitialized) by the time this returns.
void HTMLDocumentParser::constructTreeFromHTMLToken(HTMLToken& rawToken)
{
    AtomicHTMLToken token(rawToken);

    // We clear the rawToken in case constructTreeFromAtomicToken
    // synchronously re-enters the parser. We don't clear the token immediately
    // for Character tokens because the AtomicHTMLToken avoids copying the
    // characters by keeping a pointer to the underlying buffer in the
    // HTMLToken. Fortunately, Character tokens can't cause us to re-enter
    // the parser.
    //
    // FIXME: Stop clearing the rawToken once we start running the parser off
    // the main thread or once we stop allowing synchronous JavaScript
    // execution from parseAttribute.
    if (rawToken.type() != HTMLToken::Character)
        rawToken.clear();

    m_treeBuilder->constructTree(&token);

    // Only a Character token can still be uncleared here; clear it now that
    // the tree builder is done with it.
    if (!rawToken.isUninitialized()) {
        ASSERT(rawToken.type() == HTMLToken::Character);
        rawToken.clear();
    }
}
    594 
// Converts a CompactHTMLToken into an AtomicHTMLToken and feeds it to the
// tree builder.
void HTMLDocumentParser::constructTreeFromCompactHTMLToken(const CompactHTMLToken& compactToken)
{
    AtomicHTMLToken token(compactToken);
    m_treeBuilder->constructTree(&token);
}
    600 
    601 bool HTMLDocumentParser::hasInsertionPoint()
    602 {
    603     // FIXME: The wasCreatedByScript() branch here might not be fully correct.
    604     //        Our model of the EOF character differs slightly from the one in
    605     //        the spec because our treatment is uniform between network-sourced
    606     //        and script-sourced input streams whereas the spec treats them
    607     //        differently.
    608     return m_input.hasInsertionPoint() || (wasCreatedByScript() && !m_input.haveSeenEndOfFile());
    609 }
    610 
// Inserts |source| at the current insertion point and pumps the tokenizer
// synchronously (judging by the comment below, this is the document.write() path).
//
// Creates the main-thread token/tokenizer on demand, excludes the inserted
// text from line counting, scans it with a dedicated preload scanner if the
// parser ends up blocked on a script, and finally completes a delayed end.
void HTMLDocumentParser::insert(const SegmentedString& source)
{
    if (isStopped())
        return;

    // pumpTokenizer can cause this parser to be detached from the Document,
    // but we need to ensure it isn't deleted yet.
    RefPtr<HTMLDocumentParser> protect(this);

    // The tokenizer may not exist yet; create it together with its token.
    if (!m_tokenizer) {
        ASSERT(!inPumpSession());
        ASSERT(m_haveBackgroundParser || wasCreatedByScript());
        m_token = adoptPtr(new HTMLToken);
        m_tokenizer = HTMLTokenizer::create(m_options);
    }

    // Work on a copy so the caller's string is left untouched.
    SegmentedString excludedLineNumberSource(source);
    excludedLineNumberSource.setExcludeLineNumbers();
    m_input.insertAtCurrentInsertionPoint(excludedLineNumberSource);
    pumpTokenizerIfPossible(ForceSynchronous);

    if (isWaitingForScripts()) {
        // Check the document.write() output with a separate preload scanner as
        // the main scanner can't deal with insertions.
        if (!m_insertionPreloadScanner)
            m_insertionPreloadScanner = adoptPtr(new HTMLPreloadScanner(m_options, document()->url()));
        m_insertionPreloadScanner->appendToEnd(source);
        m_insertionPreloadScanner->scan(m_preloader.get(), document()->baseElementURL());
    }

    endIfDelayed();
}
    643 
// Starts the background parser: records that one exists, builds its
// configuration (options, a weak pointer back to this parser, an XSS auditor
// and a token preload scanner), and posts a task to the parser thread that
// creates the BackgroundHTMLParser.
void HTMLDocumentParser::startBackgroundParser()
{
    ASSERT(shouldUseThreading());
    ASSERT(!m_haveBackgroundParser);
    m_haveBackgroundParser = true;

    HTMLIdentifier::init();

    // m_backgroundParser is created from an unbound weak reference; the same
    // reference is handed to BackgroundHTMLParser::create in the posted task.
    RefPtr<WeakReference<BackgroundHTMLParser> > reference = WeakReference<BackgroundHTMLParser>::createUnbound();
    m_backgroundParser = WeakPtr<BackgroundHTMLParser>(reference);

    OwnPtr<BackgroundHTMLParser::Configuration> config = adoptPtr(new BackgroundHTMLParser::Configuration);
    config->options = m_options;
    config->parser = m_weakFactory.createWeakPtr();
    config->xssAuditor = adoptPtr(new XSSAuditor);
    config->xssAuditor->init(document(), &m_xssAuditorDelegate);
    config->preloadScanner = adoptPtr(new TokenPreloadScanner(document()->url().copy()));

    ASSERT(config->xssAuditor->isSafeToSendToAnotherThread());
    ASSERT(config->preloadScanner->isSafeToSendToAnotherThread());
    HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::create, reference.release(), config.release()));
}
    666 
// Stops the background parser: clears m_haveBackgroundParser, posts
// BackgroundHTMLParser::stop to the parser thread, and revokes all weak
// pointers previously created from m_weakFactory.
void HTMLDocumentParser::stopBackgroundParser()
{
    ASSERT(shouldUseThreading());
    ASSERT(m_haveBackgroundParser);
    m_haveBackgroundParser = false;

    HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::stop, m_backgroundParser));
    m_weakFactory.revokeAll();
}
    676 
    677 void HTMLDocumentParser::append(PassRefPtr<StringImpl> inputSource)
    678 {
    679     if (isStopped())
    680         return;
    681 
    682     if (shouldUseThreading()) {
    683         if (!m_haveBackgroundParser)
    684             startBackgroundParser();
    685 
    686         ASSERT(inputSource->hasOneRef());
    687         TRACE_EVENT1("net", "HTMLDocumentParser::append", "size", inputSource->length());
    688         // NOTE: Important that the String temporary is destroyed before we post the task
    689         // otherwise the String could call deref() on a StringImpl now owned by the background parser.
    690         // We would like to ASSERT(closure.arg3()->hasOneRef()) but sadly the args are private.
    691         Closure closure = bind(&BackgroundHTMLParser::append, m_backgroundParser, String(inputSource));
    692         HTMLParserThread::shared()->postTask(closure);
    693         return;
    694     }
    695 
    696     // pumpTokenizer can cause this parser to be detached from the Document,
    697     // but we need to ensure it isn't deleted yet.
    698     RefPtr<HTMLDocumentParser> protect(this);
    699     TRACE_EVENT1("net", "HTMLDocumentParser::append", "size", inputSource->length());
    700     String source(inputSource);
    701 
    702     if (m_preloadScanner) {
    703         if (m_input.current().isEmpty() && !isWaitingForScripts()) {
    704             // We have parsed until the end of the current input and so are now moving ahead of the preload scanner.
    705             // Clear the scanner so we know to scan starting from the current input point if we block again.
    706             m_preloadScanner.clear();
    707         } else {
    708             m_preloadScanner->appendToEnd(source);
    709             if (isWaitingForScripts())
    710                 m_preloadScanner->scan(m_preloader.get(), document()->baseElementURL());
    711         }
    712     }
    713 
    714     m_input.appendToEnd(source);
    715 
    716     if (inPumpSession()) {
    717         // We've gotten data off the network in a nested write.
    718         // We don't want to consume any more of the input stream now.  Do
    719         // not worry.  We'll consume this data in a less-nested write().
    720         return;
    721     }
    722 
    723     // A couple pinToMainThread() callers require synchronous parsing, but can't
    724     // easily use the insert() method, so we hack append() for them to be synchronous.
    725     // javascript: url handling is one such caller.
    726     // FIXME: This is gross, and we should separate the concept of synchronous parsing
    727     // from insert() so that only document.write() uses insert.
    728     if (m_isPinnedToMainThread)
    729         pumpTokenizerIfPossible(ForceSynchronous);
    730     else
    731         pumpTokenizerIfPossible(AllowYield);
    732 
    733     endIfDelayed();
    734 }
    735 
    736 void HTMLDocumentParser::end()
    737 {
    738     ASSERT(!isDetached());
    739     ASSERT(!isScheduledForResume());
    740 
    741     if (m_haveBackgroundParser)
    742         stopBackgroundParser();
    743 
    744     // Informs the the rest of WebCore that parsing is really finished (and deletes this).
    745     m_treeBuilder->finished();
    746 }
    747 
    748 void HTMLDocumentParser::attemptToRunDeferredScriptsAndEnd()
    749 {
    750     ASSERT(isStopping());
    751     // FIXME: It may not be correct to disable this for the background parser.
    752     // That means hasInsertionPoint() may not be correct in some cases.
    753     ASSERT(!hasInsertionPoint() || m_haveBackgroundParser);
    754     if (m_scriptRunner && !m_scriptRunner->executeScriptsWaitingForParsing())
    755         return;
    756     end();
    757 }
    758 
    759 void HTMLDocumentParser::attemptToEnd()
    760 {
    761     // finish() indicates we will not receive any more data. If we are waiting on
    762     // an external script to load, we can't finish parsing quite yet.
    763 
    764     if (shouldDelayEnd()) {
    765         m_endWasDelayed = true;
    766         return;
    767     }
    768     prepareToStopParsing();
    769 }
    770 
    771 void HTMLDocumentParser::endIfDelayed()
    772 {
    773     // If we've already been detached, don't bother ending.
    774     if (isDetached())
    775         return;
    776 
    777     if (!m_endWasDelayed || shouldDelayEnd())
    778         return;
    779 
    780     m_endWasDelayed = false;
    781     prepareToStopParsing();
    782 }
    783 
    784 void HTMLDocumentParser::finish()
    785 {
    786     // FIXME: We should ASSERT(!m_parserStopped) here, since it does not
    787     // makes sense to call any methods on DocumentParser once it's been stopped.
    788     // However, FrameLoader::stop calls DocumentParser::finish unconditionally.
    789 
    790     // Empty documents never got an append() call, and thus have never started
    791     // a background parser. In those cases, we ignore shouldUseThreading()
    792     // and fall through to the non-threading case.
    793     if (m_haveBackgroundParser) {
    794         if (!m_input.haveSeenEndOfFile())
    795             m_input.closeWithoutMarkingEndOfFile();
    796         HTMLParserThread::shared()->postTask(bind(&BackgroundHTMLParser::finish, m_backgroundParser));
    797         return;
    798     }
    799 
    800     if (!m_tokenizer) {
    801         ASSERT(!m_token);
    802         // We're finishing before receiving any data. Rather than booting up
    803         // the background parser just to spin it down, we finish parsing
    804         // synchronously.
    805         m_token = adoptPtr(new HTMLToken);
    806         m_tokenizer = HTMLTokenizer::create(m_options);
    807     }
    808 
    809     // We're not going to get any more data off the network, so we tell the
    810     // input stream we've reached the end of file. finish() can be called more
    811     // than once, if the first time does not call end().
    812     if (!m_input.haveSeenEndOfFile())
    813         m_input.markEndOfFile();
    814 
    815     attemptToEnd();
    816 }
    817 
    818 bool HTMLDocumentParser::isExecutingScript() const
    819 {
    820     if (!m_scriptRunner)
    821         return false;
    822     return m_scriptRunner->isExecutingScript();
    823 }
    824 
    825 OrdinalNumber HTMLDocumentParser::lineNumber() const
    826 {
    827     if (m_haveBackgroundParser)
    828         return m_textPosition.m_line;
    829 
    830     return m_input.current().currentLine();
    831 }
    832 
    833 TextPosition HTMLDocumentParser::textPosition() const
    834 {
    835     if (m_haveBackgroundParser)
    836         return m_textPosition;
    837 
    838     const SegmentedString& currentString = m_input.current();
    839     OrdinalNumber line = currentString.currentLine();
    840     OrdinalNumber column = currentString.currentColumn();
    841 
    842     return TextPosition(line, column);
    843 }
    844 
    845 bool HTMLDocumentParser::isWaitingForScripts() const
    846 {
    847     // When the TreeBuilder encounters a </script> tag, it returns to the HTMLDocumentParser
    848     // where the script is transfered from the treebuilder to the script runner.
    849     // The script runner will hold the script until its loaded and run. During
    850     // any of this time, we want to count ourselves as "waiting for a script" and thus
    851     // run the preload scanner, as well as delay completion of parsing.
    852     bool treeBuilderHasBlockingScript = m_treeBuilder->hasParserBlockingScript();
    853     bool scriptRunnerHasBlockingScript = m_scriptRunner && m_scriptRunner->hasParserBlockingScript();
    854     // Since the parser is paused while a script runner has a blocking script, it should
    855     // never be possible to end up with both objects holding a blocking script.
    856     ASSERT(!(treeBuilderHasBlockingScript && scriptRunnerHasBlockingScript));
    857     // If either object has a blocking script, the parser should be paused.
    858     return treeBuilderHasBlockingScript || scriptRunnerHasBlockingScript;
    859 }
    860 
    861 void HTMLDocumentParser::resumeParsingAfterScriptExecution()
    862 {
    863     ASSERT(!isExecutingScript());
    864     ASSERT(!isWaitingForScripts());
    865 
    866     if (m_haveBackgroundParser) {
    867         validateSpeculations(m_lastChunkBeforeScript.release());
    868         ASSERT(!m_lastChunkBeforeScript);
    869         // processParsedChunkFromBackgroundParser can cause this parser to be detached from the Document,
    870         // but we need to ensure it isn't deleted yet.
    871         RefPtr<HTMLDocumentParser> protect(this);
    872         pumpPendingSpeculations();
    873         return;
    874     }
    875 
    876     m_insertionPreloadScanner.clear();
    877     pumpTokenizerIfPossible(AllowYield);
    878     endIfDelayed();
    879 }
    880 
    881 void HTMLDocumentParser::watchForLoad(Resource* resource)
    882 {
    883     ASSERT(!resource->isLoaded());
    884     // addClient would call notifyFinished if the load were complete.
    885     // Callers do not expect to be re-entered from this call, so they should
    886     // not an already-loaded Resource.
    887     resource->addClient(this);
    888 }
    889 
    890 void HTMLDocumentParser::stopWatchingForLoad(Resource* resource)
    891 {
    892     resource->removeClient(this);
    893 }
    894 
    895 void HTMLDocumentParser::appendCurrentInputStreamToPreloadScannerAndScan()
    896 {
    897     ASSERT(m_preloadScanner);
    898     m_preloadScanner->appendToEnd(m_input.current());
    899     m_preloadScanner->scan(m_preloader.get(), document()->baseElementURL());
    900 }
    901 
    902 void HTMLDocumentParser::notifyFinished(Resource* cachedResource)
    903 {
    904     // pumpTokenizer can cause this parser to be detached from the Document,
    905     // but we need to ensure it isn't deleted yet.
    906     RefPtr<HTMLDocumentParser> protect(this);
    907 
    908     ASSERT(m_scriptRunner);
    909     ASSERT(!isExecutingScript());
    910     if (isStopping()) {
    911         attemptToRunDeferredScriptsAndEnd();
    912         return;
    913     }
    914 
    915     m_scriptRunner->executeScriptsWaitingForLoad(cachedResource);
    916     if (!isWaitingForScripts())
    917         resumeParsingAfterScriptExecution();
    918 }
    919 
    920 void HTMLDocumentParser::executeScriptsWaitingForResources()
    921 {
    922     // Document only calls this when the Document owns the DocumentParser
    923     // so this will not be called in the DocumentFragment case.
    924     ASSERT(m_scriptRunner);
    925     // Ignore calls unless we have a script blocking the parser waiting on a
    926     // stylesheet load.  Otherwise we are currently parsing and this
    927     // is a re-entrant call from encountering a </ style> tag.
    928     if (!m_scriptRunner->hasScriptsWaitingForResources())
    929         return;
    930 
    931     // pumpTokenizer can cause this parser to be detached from the Document,
    932     // but we need to ensure it isn't deleted yet.
    933     RefPtr<HTMLDocumentParser> protect(this);
    934     m_scriptRunner->executeScriptsWaitingForResources();
    935     if (!isWaitingForScripts())
    936         resumeParsingAfterScriptExecution();
    937 }
    938 
    939 void HTMLDocumentParser::parseDocumentFragment(const String& source, DocumentFragment* fragment, Element* contextElement, ParserContentPolicy parserContentPolicy)
    940 {
    941     RefPtr<HTMLDocumentParser> parser = HTMLDocumentParser::create(fragment, contextElement, parserContentPolicy);
    942     parser->insert(source); // Use insert() so that the parser will not yield.
    943     parser->finish();
    944     ASSERT(!parser->processingData()); // Make sure we're done. <rdar://problem/3963151>
    945     parser->detach(); // Allows ~DocumentParser to assert it was detached before destruction.
    946 }
    947 
    948 void HTMLDocumentParser::suspendScheduledTasks()
    949 {
    950     if (m_parserScheduler)
    951         m_parserScheduler->suspend();
    952 }
    953 
    954 void HTMLDocumentParser::resumeScheduledTasks()
    955 {
    956     if (m_parserScheduler)
    957         m_parserScheduler->resume();
    958 }
    959 
    960 }
    961