Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "core/html/parser/BackgroundHTMLParser.h"
     28 
     29 #include "core/html/parser/HTMLDocumentParser.h"
     30 #include "core/html/parser/TextResourceDecoder.h"
     31 #include "core/html/parser/XSSAuditor.h"
     32 #include "wtf/MainThread.h"
     33 #include "wtf/text/TextPosition.h"
     34 
     35 namespace WebCore {
     36 
     37 // On a network with high latency and high bandwidth, using a device
     38 // with a fast CPU, we could end up speculatively tokenizing
     39 // the whole document, well ahead of when the main-thread actually needs it.
     40 // This is a waste of memory (and potentially time if the speculation fails).
     41 // So we limit our outstanding tokens arbitrarily to 10,000.
     42 // Our maximal memory spent speculating will be approximately:
     43 // (outstandingTokenLimit + pendingTokenLimit) * sizeof(CompactToken)
     44 // We use a separate low and high water mark to avoid constantly topping
     45 // off the main thread's token buffer.
     46 // At time of writing, this is (10000 + 1000) * 28 bytes = ~308kb of memory.
     47 // These numbers have not been tuned.
     48 static const size_t outstandingTokenLimit = 10000;
     49 
     50 // We limit our chunks to 1000 tokens, to make sure the main
     51 // thread is never waiting on the parser thread for tokens.
     52 // This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408.
     53 static const size_t pendingTokenLimit = 1000;
     54 
     55 using namespace HTMLNames;
     56 
     57 #ifndef NDEBUG
     58 
     59 static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens)
     60 {
     61     for (size_t i = 0; i < tokens->size(); ++i)
     62         ASSERT(tokens->at(i).isSafeToSendToAnotherThread());
     63 }
     64 
     65 static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads)
     66 {
     67     for (size_t i = 0; i < preloads.size(); ++i)
     68         ASSERT(preloads[i]->isSafeToSendToAnotherThread());
     69 }
     70 
     71 static void checkThatXSSInfosAreSafeToSendToAnotherThread(const XSSInfoStream& infos)
     72 {
     73     for (size_t i = 0; i < infos.size(); ++i)
     74         ASSERT(infos[i]->isSafeToSendToAnotherThread());
     75 }
     76 
     77 #endif
     78 
     79 void BackgroundHTMLParser::start(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
     80 {
     81     new BackgroundHTMLParser(reference, config);
     82     // Caller must free by calling stop().
     83 }
     84 
     85 BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config)
     86     : m_weakFactory(reference, this)
     87     , m_token(adoptPtr(new HTMLToken))
     88     , m_tokenizer(HTMLTokenizer::create(config->options))
     89     , m_treeBuilderSimulator(config->options)
     90     , m_options(config->options)
     91     , m_parser(config->parser)
     92     , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream))
     93     , m_xssAuditor(config->xssAuditor.release())
     94     , m_preloadScanner(config->preloadScanner.release())
     95     , m_decoder(config->decoder.release())
     96 {
     97 }
     98 
     99 BackgroundHTMLParser::~BackgroundHTMLParser()
    100 {
    101 }
    102 
    103 void BackgroundHTMLParser::appendRawBytesFromParserThread(const char* data, int dataLength)
    104 {
    105     ASSERT(m_decoder);
    106     updateDocument(m_decoder->decode(data, dataLength));
    107 }
    108 
    109 void BackgroundHTMLParser::appendRawBytesFromMainThread(PassOwnPtr<Vector<char> > buffer)
    110 {
    111     ASSERT(m_decoder);
    112     updateDocument(m_decoder->decode(buffer->data(), buffer->size()));
    113 }
    114 
    115 void BackgroundHTMLParser::appendDecodedBytes(const String& input)
    116 {
    117     ASSERT(!m_input.current().isClosed());
    118     m_input.append(input);
    119     pumpTokenizer();
    120 }
    121 
    122 void BackgroundHTMLParser::setDecoder(PassOwnPtr<TextResourceDecoder> decoder)
    123 {
    124     ASSERT(decoder);
    125     m_decoder = decoder;
    126 }
    127 
    128 void BackgroundHTMLParser::flush()
    129 {
    130     ASSERT(m_decoder);
    131     updateDocument(m_decoder->flush());
    132 }
    133 
    134 void BackgroundHTMLParser::updateDocument(const String& decodedData)
    135 {
    136     DocumentEncodingData encodingData(*m_decoder.get());
    137 
    138     if (encodingData != m_lastSeenEncodingData) {
    139         m_lastSeenEncodingData = encodingData;
    140 
    141         m_xssAuditor->setEncoding(encodingData.encoding());
    142         callOnMainThread(bind(&HTMLDocumentParser::didReceiveEncodingDataFromBackgroundParser, m_parser, encodingData));
    143     }
    144 
    145     if (decodedData.isEmpty())
    146         return;
    147 
    148     appendDecodedBytes(decodedData);
    149 }
    150 
    151 void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint)
    152 {
    153     m_parser = checkpoint->parser;
    154     m_token = checkpoint->token.release();
    155     m_tokenizer = checkpoint->tokenizer.release();
    156     m_treeBuilderSimulator.setState(checkpoint->treeBuilderState);
    157     m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput);
    158     m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint);
    159     pumpTokenizer();
    160 }
    161 
    162 void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint)
    163 {
    164     // Note, we should not have to worry about the index being invalid
    165     // as messages from the main thread will be processed in FIFO order.
    166     m_input.invalidateCheckpointsBefore(inputCheckpoint);
    167     pumpTokenizer();
    168 }
    169 
    170 void BackgroundHTMLParser::finish()
    171 {
    172     markEndOfFile();
    173     pumpTokenizer();
    174 }
    175 
    176 void BackgroundHTMLParser::stop()
    177 {
    178     delete this;
    179 }
    180 
    181 void BackgroundHTMLParser::forcePlaintextForTextDocument()
    182 {
    183     // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser)
    184     // to force us into the PLAINTEXT state w/o using a <plaintext> tag.
    185     // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons.
    186     m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState);
    187 }
    188 
    189 void BackgroundHTMLParser::markEndOfFile()
    190 {
    191     ASSERT(!m_input.current().isClosed());
    192     m_input.append(String(&kEndOfFileMarker, 1));
    193     m_input.close();
    194 }
    195 
    196 void BackgroundHTMLParser::pumpTokenizer()
    197 {
    198     // No need to start speculating until the main thread has almost caught up.
    199     if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
    200         return;
    201 
    202     while (true) {
    203         m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token);
    204         if (!m_tokenizer->nextToken(m_input.current(), *m_token)) {
    205             // We've reached the end of our current input.
    206             sendTokensToMainThread();
    207             break;
    208         }
    209         m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token);
    210 
    211         {
    212             TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn());
    213 
    214             if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) {
    215                 xssInfo->m_textPosition = position;
    216                 m_pendingXSSInfos.append(xssInfo.release());
    217             }
    218 
    219             CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()));
    220 
    221             m_preloadScanner->scan(token, m_input.current(), m_pendingPreloads);
    222 
    223             m_pendingTokens->append(token);
    224         }
    225 
    226         m_token->clear();
    227 
    228         if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) {
    229             sendTokensToMainThread();
    230             // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory.
    231             if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit)
    232                 break;
    233         }
    234     }
    235 }
    236 
    237 void BackgroundHTMLParser::sendTokensToMainThread()
    238 {
    239     if (m_pendingTokens->isEmpty())
    240         return;
    241 
    242 #ifndef NDEBUG
    243     checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get());
    244     checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads);
    245     checkThatXSSInfosAreSafeToSendToAnotherThread(m_pendingXSSInfos);
    246 #endif
    247 
    248     OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk);
    249     chunk->preloads.swap(m_pendingPreloads);
    250     chunk->xssInfos.swap(m_pendingXSSInfos);
    251     chunk->tokenizerState = m_tokenizer->state();
    252     chunk->treeBuilderState = m_treeBuilderSimulator.state();
    253     chunk->inputCheckpoint = m_input.createCheckpoint(m_pendingTokens->size());
    254     chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint();
    255     chunk->tokens = m_pendingTokens.release();
    256     callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release()));
    257 
    258     m_pendingTokens = adoptPtr(new CompactHTMLTokenStream);
    259 }
    260 
    261 }
    262