1 /* 2 * Copyright (C) 2013 Google, Inc. All Rights Reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #include "config.h" 27 #include "core/html/parser/BackgroundHTMLParser.h" 28 29 #include "core/html/parser/HTMLDocumentParser.h" 30 #include "core/html/parser/TextResourceDecoder.h" 31 #include "core/html/parser/XSSAuditor.h" 32 #include "wtf/MainThread.h" 33 #include "wtf/text/TextPosition.h" 34 35 namespace blink { 36 37 // On a network with high latency and high bandwidth, using a device 38 // with a fast CPU, we could end up speculatively tokenizing 39 // the whole document, well ahead of when the main-thread actually needs it. 40 // This is a waste of memory (and potentially time if the speculation fails). 41 // So we limit our outstanding tokens arbitrarily to 10,000. 42 // Our maximal memory spent speculating will be approximately: 43 // (outstandingTokenLimit + pendingTokenLimit) * sizeof(CompactToken) 44 // We use a separate low and high water mark to avoid constantly topping 45 // off the main thread's token buffer. 46 // At time of writing, this is (10000 + 1000) * 28 bytes = ~308kb of memory. 47 // These numbers have not been tuned. 48 static const size_t outstandingTokenLimit = 10000; 49 50 // We limit our chucks to 1000 tokens, to make sure the main 51 // thread is never waiting on the parser thread for tokens. 52 // This was tuned in https://bugs.webkit.org/show_bug.cgi?id=110408. 53 static const size_t pendingTokenLimit = 1000; 54 55 using namespace HTMLNames; 56 57 #if ENABLE(ASSERT) 58 59 static void checkThatTokensAreSafeToSendToAnotherThread(const CompactHTMLTokenStream* tokens) 60 { 61 for (size_t i = 0; i < tokens->size(); ++i) 62 ASSERT(tokens->at(i).isSafeToSendToAnotherThread()); 63 } 64 65 static void checkThatPreloadsAreSafeToSendToAnotherThread(const PreloadRequestStream& preloads) 66 { 67 for (size_t i = 0; i < preloads.size(); ++i) 68 ASSERT(preloads[i]->isSafeToSendToAnotherThread()); 69 } 70 71 static void checkThatXSSInfosAreSafeToSendToAnotherThread(const XSSInfoStream& infos) 72 { 73 for (size_t i = 0; i < infos.size(); ++i) 74 ASSERT(infos[i]->isSafeToSendToAnotherThread()); 75 } 76 77 #endif 78 79 void BackgroundHTMLParser::start(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config) 80 { 81 new BackgroundHTMLParser(reference, config); 82 // Caller must free by calling stop(). 83 } 84 85 BackgroundHTMLParser::BackgroundHTMLParser(PassRefPtr<WeakReference<BackgroundHTMLParser> > reference, PassOwnPtr<Configuration> config) 86 : m_weakFactory(reference, this) 87 , m_token(adoptPtr(new HTMLToken)) 88 , m_tokenizer(HTMLTokenizer::create(config->options)) 89 , m_treeBuilderSimulator(config->options) 90 , m_options(config->options) 91 , m_parser(config->parser) 92 , m_pendingTokens(adoptPtr(new CompactHTMLTokenStream)) 93 , m_xssAuditor(config->xssAuditor.release()) 94 , m_preloadScanner(config->preloadScanner.release()) 95 , m_decoder(config->decoder.release()) 96 { 97 } 98 99 BackgroundHTMLParser::~BackgroundHTMLParser() 100 { 101 } 102 103 void BackgroundHTMLParser::appendRawBytesFromParserThread(const char* data, int dataLength) 104 { 105 ASSERT(m_decoder); 106 updateDocument(m_decoder->decode(data, dataLength)); 107 } 108 109 void BackgroundHTMLParser::appendRawBytesFromMainThread(PassOwnPtr<Vector<char> > buffer) 110 { 111 ASSERT(m_decoder); 112 updateDocument(m_decoder->decode(buffer->data(), buffer->size())); 113 } 114 115 void BackgroundHTMLParser::appendDecodedBytes(const String& input) 116 { 117 ASSERT(!m_input.current().isClosed()); 118 m_input.append(input); 119 pumpTokenizer(); 120 } 121 122 void BackgroundHTMLParser::setDecoder(PassOwnPtr<TextResourceDecoder> decoder) 123 { 124 ASSERT(decoder); 125 m_decoder = decoder; 126 } 127 128 void BackgroundHTMLParser::flush() 129 { 130 ASSERT(m_decoder); 131 updateDocument(m_decoder->flush()); 132 } 133 134 void BackgroundHTMLParser::updateDocument(const String& decodedData) 135 { 136 DocumentEncodingData encodingData(*m_decoder.get()); 137 138 if (encodingData != m_lastSeenEncodingData) { 139 m_lastSeenEncodingData = encodingData; 140 141 m_xssAuditor->setEncoding(encodingData.encoding()); 142 callOnMainThread(bind(&HTMLDocumentParser::didReceiveEncodingDataFromBackgroundParser, m_parser, encodingData)); 143 } 144 145 if (decodedData.isEmpty()) 146 return; 147 148 appendDecodedBytes(decodedData); 149 } 150 151 void BackgroundHTMLParser::resumeFrom(PassOwnPtr<Checkpoint> checkpoint) 152 { 153 m_parser = checkpoint->parser; 154 m_token = checkpoint->token.release(); 155 m_tokenizer = checkpoint->tokenizer.release(); 156 m_treeBuilderSimulator.setState(checkpoint->treeBuilderState); 157 m_input.rewindTo(checkpoint->inputCheckpoint, checkpoint->unparsedInput); 158 m_preloadScanner->rewindTo(checkpoint->preloadScannerCheckpoint); 159 pumpTokenizer(); 160 } 161 162 void BackgroundHTMLParser::startedChunkWithCheckpoint(HTMLInputCheckpoint inputCheckpoint) 163 { 164 // Note, we should not have to worry about the index being invalid 165 // as messages from the main thread will be processed in FIFO order. 166 m_input.invalidateCheckpointsBefore(inputCheckpoint); 167 pumpTokenizer(); 168 } 169 170 void BackgroundHTMLParser::finish() 171 { 172 markEndOfFile(); 173 pumpTokenizer(); 174 } 175 176 void BackgroundHTMLParser::stop() 177 { 178 delete this; 179 } 180 181 void BackgroundHTMLParser::forcePlaintextForTextDocument() 182 { 183 // This is only used by the TextDocumentParser (a subclass of HTMLDocumentParser) 184 // to force us into the PLAINTEXT state w/o using a <plaintext> tag. 185 // The TextDocumentParser uses a <pre> tag for historical/compatibility reasons. 186 m_tokenizer->setState(HTMLTokenizer::PLAINTEXTState); 187 } 188 189 void BackgroundHTMLParser::markEndOfFile() 190 { 191 ASSERT(!m_input.current().isClosed()); 192 m_input.append(String(&kEndOfFileMarker, 1)); 193 m_input.close(); 194 } 195 196 void BackgroundHTMLParser::pumpTokenizer() 197 { 198 // No need to start speculating until the main thread has almost caught up. 199 if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit) 200 return; 201 202 while (true) { 203 m_sourceTracker.start(m_input.current(), m_tokenizer.get(), *m_token); 204 if (!m_tokenizer->nextToken(m_input.current(), *m_token)) { 205 // We've reached the end of our current input. 206 sendTokensToMainThread(); 207 break; 208 } 209 m_sourceTracker.end(m_input.current(), m_tokenizer.get(), *m_token); 210 211 { 212 TextPosition position = TextPosition(m_input.current().currentLine(), m_input.current().currentColumn()); 213 214 if (OwnPtr<XSSInfo> xssInfo = m_xssAuditor->filterToken(FilterTokenRequest(*m_token, m_sourceTracker, m_tokenizer->shouldAllowCDATA()))) { 215 xssInfo->m_textPosition = position; 216 m_pendingXSSInfos.append(xssInfo.release()); 217 } 218 219 CompactHTMLToken token(m_token.get(), TextPosition(m_input.current().currentLine(), m_input.current().currentColumn())); 220 221 m_preloadScanner->scan(token, m_input.current(), m_pendingPreloads); 222 223 m_pendingTokens->append(token); 224 } 225 226 m_token->clear(); 227 228 if (!m_treeBuilderSimulator.simulate(m_pendingTokens->last(), m_tokenizer.get()) || m_pendingTokens->size() >= pendingTokenLimit) { 229 sendTokensToMainThread(); 230 // If we're far ahead of the main thread, yield for a bit to avoid consuming too much memory. 231 if (m_input.totalCheckpointTokenCount() > outstandingTokenLimit) 232 break; 233 } 234 } 235 } 236 237 void BackgroundHTMLParser::sendTokensToMainThread() 238 { 239 if (m_pendingTokens->isEmpty()) 240 return; 241 242 #if ENABLE(ASSERT) 243 checkThatTokensAreSafeToSendToAnotherThread(m_pendingTokens.get()); 244 checkThatPreloadsAreSafeToSendToAnotherThread(m_pendingPreloads); 245 checkThatXSSInfosAreSafeToSendToAnotherThread(m_pendingXSSInfos); 246 #endif 247 248 OwnPtr<HTMLDocumentParser::ParsedChunk> chunk = adoptPtr(new HTMLDocumentParser::ParsedChunk); 249 chunk->preloads.swap(m_pendingPreloads); 250 chunk->xssInfos.swap(m_pendingXSSInfos); 251 chunk->tokenizerState = m_tokenizer->state(); 252 chunk->treeBuilderState = m_treeBuilderSimulator.state(); 253 chunk->inputCheckpoint = m_input.createCheckpoint(m_pendingTokens->size()); 254 chunk->preloadScannerCheckpoint = m_preloadScanner->createCheckpoint(); 255 chunk->tokens = m_pendingTokens.release(); 256 callOnMainThread(bind(&HTMLDocumentParser::didReceiveParsedChunkFromBackgroundParser, m_parser, chunk.release())); 257 258 m_pendingTokens = adoptPtr(new CompactHTMLTokenStream); 259 } 260 261 } 262