Home | History | Annotate | Download | only in html
      1 /*
      2     Copyright (C) 1997 Martin Jones (mjones (at) kde.org)
      3               (C) 1997 Torben Weis (weis (at) kde.org)
      4               (C) 1998 Waldo Bastian (bastian (at) kde.org)
      5               (C) 1999 Lars Knoll (knoll (at) kde.org)
      6               (C) 1999 Antti Koivisto (koivisto (at) kde.org)
      7               (C) 2001 Dirk Mueller (mueller (at) kde.org)
      8     Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      9     Copyright (C) 2005, 2006 Alexey Proskuryakov (ap (at) nypop.com)
     10     Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
     11 
     12     This library is free software; you can redistribute it and/or
     13     modify it under the terms of the GNU Library General Public
     14     License as published by the Free Software Foundation; either
     15     version 2 of the License, or (at your option) any later version.
     16 
     17     This library is distributed in the hope that it will be useful,
     18     but WITHOUT ANY WARRANTY; without even the implied warranty of
     19     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20     Library General Public License for more details.
     21 
     22     You should have received a copy of the GNU Library General Public License
     23     along with this library; see the file COPYING.LIB.  If not, write to
     24     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     25     Boston, MA 02110-1301, USA.
     26 */
     27 
     28 #include "config.h"
     29 #include "HTMLTokenizer.h"
     30 
     31 #include "CSSHelper.h"
     32 #include "Cache.h"
     33 #include "CachedScript.h"
     34 #include "DocLoader.h"
     35 #include "DocumentFragment.h"
     36 #include "Event.h"
     37 #include "EventNames.h"
     38 #include "Frame.h"
     39 #include "FrameLoader.h"
     40 #include "FrameView.h"
     41 #include "HTMLElement.h"
     42 #include "HTMLNames.h"
     43 #include "HTMLParser.h"
     44 #include "HTMLScriptElement.h"
     45 #include "HTMLViewSourceDocument.h"
     46 #include "ImageLoader.h"
     47 #include "InspectorTimelineAgent.h"
     48 #include "MappedAttribute.h"
     49 #include "Page.h"
     50 #include "PreloadScanner.h"
     51 #include "ScriptController.h"
     52 #include "ScriptSourceCode.h"
     53 #include "ScriptValue.h"
     54 #include "XSSAuditor.h"
     55 #include <wtf/ASCIICType.h>
     56 #include <wtf/CurrentTime.h>
     57 
     58 #include "HTMLEntityNames.c"
     59 
     60 #ifdef ANDROID_INSTRUMENT
     61 #include "TimeCounter.h"
     62 #endif
     63 
     64 #define PRELOAD_SCANNER_ENABLED 1
     65 // #define INSTRUMENT_LAYOUT_SCHEDULING 1
     66 
     67 using namespace WTF;
     68 using namespace std;
     69 
     70 namespace WebCore {
     71 
     72 using namespace HTMLNames;
     73 
     74 #if MOBILE
     75 // The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.
     76 // This value is used to define how many characters the tokenizer will process before
     77 // yeilding control.
     78 static const int defaultTokenizerChunkSize = 256;
     79 #else
     80 static const int defaultTokenizerChunkSize = 4096;
     81 #endif
     82 
     83 #if MOBILE
     84 // As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise
     85 // it will take way to long to load a page.
     86 static const double defaultTokenizerTimeDelay = 0.300;
     87 #else
     88 // FIXME: We would like this constant to be 200ms.
     89 // Yielding more aggressively results in increased responsiveness and better incremental rendering.
     90 // It slows down overall page-load on slower machines, though, so for now we set a value of 500.
     91 static const double defaultTokenizerTimeDelay = 0.500;
     92 #endif
     93 
     94 static const char commentStart [] = "<!--";
     95 static const char doctypeStart [] = "<!doctype";
     96 static const char publicStart [] = "public";
     97 static const char systemStart [] = "system";
     98 static const char scriptEnd [] = "</script";
     99 static const char xmpEnd [] = "</xmp";
    100 static const char styleEnd [] =  "</style";
    101 static const char textareaEnd [] = "</textarea";
    102 static const char titleEnd [] = "</title";
    103 static const char iframeEnd [] = "</iframe";
    104 
    105 // Full support for MS Windows extensions to Latin-1.
    106 // Technically these extensions should only be activated for pages
    107 // marked "windows-1252" or "cp1252", but
    108 // in the standard Microsoft way, these extensions infect hundreds of thousands
    109 // of web pages.  Note that people with non-latin-1 Microsoft extensions
    110 // are SOL.
    111 //
    112 // See: http://www.microsoft.com/globaldev/reference/WinCP.asp
    113 //      http://www.bbsinc.com/iso8859.html
    114 //      http://www.obviously.com/
    115 //
    116 // There may be better equivalents
    117 
    118 // We only need this for entities. For non-entity text, we handle this in the text encoding.
    119 
    120 static const UChar windowsLatin1ExtensionArray[32] = {
    121     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    122     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    123     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    124     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178  // 98-9F
    125 };
    126 
    127 static inline UChar fixUpChar(UChar c)
    128 {
    129     if ((c & ~0x1F) != 0x0080)
    130         return c;
    131     return windowsLatin1ExtensionArray[c - 0x80];
    132 }
    133 
    134 static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)
    135 {
    136     for (unsigned i = 0; i != length; ++i) {
    137         unsigned char c1 = s1[i];
    138         unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));
    139         UChar c2 = s2[i];
    140         if (c1 != c2 && uc1 != c2)
    141             return false;
    142     }
    143     return true;
    144 }
    145 
    146 inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)
    147 {
    148     if (!attrName.isEmpty()) {
    149         ASSERT(!attrName.contains('/'));
    150         RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);
    151         if (!attrs) {
    152             attrs = NamedMappedAttrMap::create();
    153             attrs->reserveInitialCapacity(10);
    154         }
    155         attrs->insertAttribute(a.release(), viewSourceMode);
    156     }
    157 
    158     attrName = emptyAtom;
    159 }
    160 
    161 // ----------------------------------------------------------------------------
    162 
    163 HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)
    164     : Tokenizer()
    165     , m_buffer(0)
    166     , m_scriptCode(0)
    167     , m_scriptCodeSize(0)
    168     , m_scriptCodeCapacity(0)
    169     , m_scriptCodeResync(0)
    170     , m_executingScript(0)
    171     , m_requestingScript(false)
    172     , m_hasScriptsWaitingForStylesheets(false)
    173     , m_timer(this, &HTMLTokenizer::timerFired)
    174     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    175     , m_doc(doc)
    176     , m_parser(new HTMLParser(doc, reportErrors))
    177     , m_inWrite(false)
    178     , m_fragment(false)
    179     , m_scriptingPermission(FragmentScriptingAllowed)
    180 {
    181     begin();
    182 }
    183 
    184 HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)
    185     : Tokenizer(true)
    186     , m_buffer(0)
    187     , m_scriptCode(0)
    188     , m_scriptCodeSize(0)
    189     , m_scriptCodeCapacity(0)
    190     , m_scriptCodeResync(0)
    191     , m_executingScript(0)
    192     , m_requestingScript(false)
    193     , m_hasScriptsWaitingForStylesheets(false)
    194     , m_timer(this, &HTMLTokenizer::timerFired)
    195     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    196     , m_doc(doc)
    197     , m_parser(0)
    198     , m_inWrite(false)
    199     , m_fragment(false)
    200     , m_scriptingPermission(FragmentScriptingAllowed)
    201 {
    202     begin();
    203 }
    204 
    205 HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag, FragmentScriptingPermission scriptingPermission)
    206     : m_buffer(0)
    207     , m_scriptCode(0)
    208     , m_scriptCodeSize(0)
    209     , m_scriptCodeCapacity(0)
    210     , m_scriptCodeResync(0)
    211     , m_executingScript(0)
    212     , m_requestingScript(false)
    213     , m_hasScriptsWaitingForStylesheets(false)
    214     , m_timer(this, &HTMLTokenizer::timerFired)
    215     , m_externalScriptsTimer(this, &HTMLTokenizer::executeExternalScriptsTimerFired)
    216     , m_doc(frag->document())
    217     , m_parser(new HTMLParser(frag, scriptingPermission))
    218     , m_inWrite(false)
    219     , m_fragment(true)
    220     , m_scriptingPermission(scriptingPermission)
    221 {
    222     begin();
    223 }
    224 
    225 void HTMLTokenizer::reset()
    226 {
    227     ASSERT(m_executingScript == 0);
    228 
    229     while (!m_pendingScripts.isEmpty()) {
    230         CachedScript* cs = m_pendingScripts.first().get();
    231         m_pendingScripts.removeFirst();
    232         ASSERT(cache()->disabled() || cs->accessCount() > 0);
    233         cs->removeClient(this);
    234     }
    235 
    236     fastFree(m_buffer);
    237     m_buffer = m_dest = 0;
    238     m_bufferSize = 0;
    239 
    240     fastFree(m_scriptCode);
    241     m_scriptCode = 0;
    242     m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
    243 
    244     m_timer.stop();
    245     m_externalScriptsTimer.stop();
    246 
    247     m_state.setAllowYield(false);
    248     m_state.setForceSynchronous(false);
    249 
    250     m_currentToken.reset();
    251     m_doctypeToken.reset();
    252     m_doctypeSearchCount = 0;
    253     m_doctypeSecondarySearchCount = 0;
    254     m_hasScriptsWaitingForStylesheets = false;
    255 }
    256 
    257 void HTMLTokenizer::begin()
    258 {
    259     m_executingScript = 0;
    260     m_requestingScript = false;
    261     m_hasScriptsWaitingForStylesheets = false;
    262     m_state.setLoadingExtScript(false);
    263     reset();
    264     m_bufferSize = 254;
    265     m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));
    266     m_dest = m_buffer;
    267     tquote = NoQuote;
    268     searchCount = 0;
    269     m_state.setEntityState(NoEntity);
    270     m_scriptTagSrcAttrValue = String();
    271     m_pendingSrc.clear();
    272     m_currentPrependingSrc = 0;
    273     m_noMoreData = false;
    274     m_brokenComments = false;
    275     m_brokenServer = false;
    276     m_lineNumber = 0;
    277     m_currentScriptTagStartLineNumber = 0;
    278     m_currentTagStartLineNumber = 0;
    279     m_state.setForceSynchronous(false);
    280 
    281     Page* page = m_doc->page();
    282     if (page && page->hasCustomHTMLTokenizerTimeDelay())
    283         m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();
    284     else
    285         m_tokenizerTimeDelay = defaultTokenizerTimeDelay;
    286 
    287     if (page && page->hasCustomHTMLTokenizerChunkSize())
    288         m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();
    289     else
    290         m_tokenizerChunkSize = defaultTokenizerChunkSize;
    291 }
    292 
    293 void HTMLTokenizer::setForceSynchronous(bool force)
    294 {
    295     m_state.setForceSynchronous(force);
    296 }
    297 
    298 HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)
    299 {
    300     // This function adds the listing 'list' as
    301     // preformatted text-tokens to the token-collection
    302     while (!list.isEmpty()) {
    303         if (state.skipLF()) {
    304             state.setSkipLF(false);
    305             if (*list == '\n') {
    306                 list.advance();
    307                 continue;
    308             }
    309         }
    310 
    311         checkBuffer();
    312 
    313         if (*list == '\n' || *list == '\r') {
    314             if (state.discardLF())
    315                 // Ignore this LF
    316                 state.setDiscardLF(false); // We have discarded 1 LF
    317             else
    318                 *m_dest++ = '\n';
    319 
    320             /* Check for MS-DOS CRLF sequence */
    321             if (*list == '\r')
    322                 state.setSkipLF(true);
    323 
    324             list.advance();
    325         } else {
    326             state.setDiscardLF(false);
    327             *m_dest++ = *list;
    328             list.advance();
    329         }
    330     }
    331 
    332     return state;
    333 }
    334 
    335 HTMLTokenizer::State HTMLTokenizer::parseNonHTMLText(SegmentedString& src, State state)
    336 {
    337     ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());
    338     ASSERT(!state.hasTagState());
    339     ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1);
    340     if (state.inScript() && !m_currentScriptTagStartLineNumber)
    341         m_currentScriptTagStartLineNumber = m_lineNumber;
    342 
    343     if (state.inComment())
    344         state = parseComment(src, state);
    345 
    346     int lastDecodedEntityPosition = -1;
    347     while (!src.isEmpty()) {
    348         checkScriptBuffer();
    349         UChar ch = *src;
    350 
    351         if (!m_scriptCodeResync && !m_brokenComments &&
    352             !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&
    353             m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&
    354             (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {
    355             state.setInComment(true);
    356             state = parseComment(src, state);
    357             continue;
    358         }
    359         if (m_scriptCodeResync && !tquote && ch == '>') {
    360             src.advancePastNonNewline();
    361             m_scriptCodeSize = m_scriptCodeResync - 1;
    362             m_scriptCodeResync = 0;
    363             m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;
    364             if (state.inScript())
    365                 state = scriptHandler(state);
    366             else {
    367                 state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
    368                 processToken();
    369                 if (state.inStyle()) {
    370                     m_currentToken.tagName = styleTag.localName();
    371                     m_currentToken.beginTag = false;
    372                 } else if (state.inTextArea()) {
    373                     m_currentToken.tagName = textareaTag.localName();
    374                     m_currentToken.beginTag = false;
    375                 } else if (state.inTitle()) {
    376                     m_currentToken.tagName = titleTag.localName();
    377                     m_currentToken.beginTag = false;
    378                 } else if (state.inXmp()) {
    379                     m_currentToken.tagName = xmpTag.localName();
    380                     m_currentToken.beginTag = false;
    381                 } else if (state.inIFrame()) {
    382                     m_currentToken.tagName = iframeTag.localName();
    383                     m_currentToken.beginTag = false;
    384                 }
    385                 processToken();
    386                 state.setInStyle(false);
    387                 state.setInScript(false);
    388                 state.setInTextArea(false);
    389                 state.setInTitle(false);
    390                 state.setInXmp(false);
    391                 state.setInIFrame(false);
    392                 tquote = NoQuote;
    393                 m_scriptCodeSize = m_scriptCodeResync = 0;
    394             }
    395             return state;
    396         }
    397         // possible end of tagname, lets check.
    398         if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&
    399              m_scriptCodeSize >= m_searchStopperLength &&
    400              tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&
    401              (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {
    402             m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;
    403             tquote = NoQuote;
    404             continue;
    405         }
    406         if (m_scriptCodeResync && !state.escaped()) {
    407             if (ch == '\"')
    408                 tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);
    409             else if (ch == '\'')
    410                 tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;
    411             else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))
    412                 tquote = NoQuote;
    413         }
    414         state.setEscaped(!state.escaped() && ch == '\\');
    415         if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {
    416             UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;
    417             src.advancePastNonNewline();
    418             state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);
    419             if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)
    420                 lastDecodedEntityPosition = m_scriptCodeSize;
    421             else
    422                 m_scriptCodeSize = scriptCodeDest - m_scriptCode;
    423         } else {
    424             m_scriptCode[m_scriptCodeSize++] = ch;
    425             src.advance(m_lineNumber);
    426         }
    427     }
    428 
    429     return state;
    430 }
    431 
    432 HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)
    433 {
    434     // We are inside a <script>
    435     bool doScriptExec = false;
    436     int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenzier line numbers are 0 based
    437 
    438     // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element
    439     m_currentScriptTagStartLineNumber = 0;
    440 
    441     // (Bugzilla 3837) Scripts following a frameset element should not execute or,
    442     // in the case of extern scripts, even load.
    443     bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));
    444 
    445     CachedScript* cs = 0;
    446     // don't load external scripts for standalone documents (for now)
    447     if (!inViewSourceMode()) {
    448         if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {
    449             // forget what we just got; load from src url instead
    450             if (!m_parser->skipMode() && !followingFrameset) {
    451 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
    452                 if (!m_doc->ownerElement())
    453                     printf("Requesting script at time %d\n", m_doc->elapsedTime());
    454 #endif
    455                 // The parser might have been stopped by for example a window.close call in an earlier script.
    456                 // If so, we don't want to load scripts.
    457                 if (!m_parserStopped && m_scriptNode->dispatchBeforeLoadEvent(m_scriptTagSrcAttrValue) &&
    458                     (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))
    459                     m_pendingScripts.append(cs);
    460                 else
    461                     m_scriptNode = 0;
    462             } else
    463                 m_scriptNode = 0;
    464             m_scriptTagSrcAttrValue = String();
    465         } else {
    466             // Parse m_scriptCode containing <script> info
    467             doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();
    468 #if ENABLE(XHTMLMP)
    469             if (!doScriptExec)
    470                 m_doc->setShouldProcessNoscriptElement(true);
    471 #endif
    472             m_scriptNode = 0;
    473         }
    474     }
    475 
    476     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);
    477     RefPtr<Node> node = processToken();
    478 
    479     if (node && m_scriptingPermission == FragmentScriptingNotAllowed) {
    480         ExceptionCode ec;
    481         node->remove(ec);
    482         node = 0;
    483     }
    484 
    485     String scriptString = node ? node->textContent() : "";
    486     m_currentToken.tagName = scriptTag.localName();
    487     m_currentToken.beginTag = false;
    488     processToken();
    489 
    490     state.setInScript(false);
    491     m_scriptCodeSize = m_scriptCodeResync = 0;
    492 
    493     // FIXME: The script should be syntax highlighted.
    494     if (inViewSourceMode())
    495         return state;
    496 
    497     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    498     SegmentedString prependingSrc;
    499     m_currentPrependingSrc = &prependingSrc;
    500 
    501 #ifdef ANDROID_INSTRUMENT
    502     android::TimeCounter::recordNoCounter(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
    503 #endif
    504 
    505     if (!m_parser->skipMode() && !followingFrameset) {
    506         if (cs) {
    507             if (savedPrependingSrc)
    508                 savedPrependingSrc->append(m_src);
    509             else
    510                 m_pendingSrc.prepend(m_src);
    511             setSrc(SegmentedString());
    512 
    513             // the ref() call below may call notifyFinished if the script is already in cache,
    514             // and that mucks with the state directly, so we must write it back to the object.
    515             m_state = state;
    516             bool savedRequestingScript = m_requestingScript;
    517             m_requestingScript = true;
    518             cs->addClient(this);
    519             m_requestingScript = savedRequestingScript;
    520             state = m_state;
    521             // will be 0 if script was already loaded and ref() executed it
    522             if (!m_pendingScripts.isEmpty())
    523                 state.setLoadingExtScript(true);
    524         } else if (!m_fragment && doScriptExec) {
    525             if (!m_executingScript)
    526                 m_pendingSrc.prepend(m_src);
    527             else
    528                 prependingSrc = m_src;
    529             setSrc(SegmentedString());
    530             state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);
    531         }
    532     }
    533 
    534 #ifdef ANDROID_INSTRUMENT
    535     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
    536 #endif
    537 
    538     if (!m_executingScript && !state.loadingExtScript()) {
    539         m_src.append(m_pendingSrc);
    540         m_pendingSrc.clear();
    541     } else if (!prependingSrc.isEmpty()) {
    542         // restore first so that the write appends in the right place
    543         // (does not hurt to do it again below)
    544         m_currentPrependingSrc = savedPrependingSrc;
    545 
    546         // we need to do this slightly modified bit of one of the write() cases
    547         // because we want to prepend to m_pendingSrc rather than appending
    548         // if there's no previous prependingSrc
    549         if (!m_pendingScripts.isEmpty()) {
    550             if (m_currentPrependingSrc)
    551                 m_currentPrependingSrc->append(prependingSrc);
    552             else
    553                 m_pendingSrc.prepend(prependingSrc);
    554         } else {
    555             m_state = state;
    556             write(prependingSrc, false);
    557             state = m_state;
    558         }
    559     }
    560 
    561 #if PRELOAD_SCANNER_ENABLED
    562     if (!m_pendingScripts.isEmpty() && !m_executingScript) {
    563         if (!m_preloadScanner)
    564             m_preloadScanner.set(new PreloadScanner(m_doc));
    565         if (!m_preloadScanner->inProgress()) {
    566             m_preloadScanner->begin();
    567             m_preloadScanner->write(m_pendingSrc);
    568         }
    569     }
    570 #endif
    571     m_currentPrependingSrc = savedPrependingSrc;
    572 
    573     return state;
    574 }
    575 
    576 HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)
    577 {
    578     if (m_fragment || !m_doc->frame())
    579         return state;
    580     m_executingScript++;
    581 
    582     SegmentedString* savedPrependingSrc = m_currentPrependingSrc;
    583     SegmentedString prependingSrc;
    584     m_currentPrependingSrc = &prependingSrc;
    585 
    586 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
    587     if (!m_doc->ownerElement())
    588         printf("beginning script execution at %d\n", m_doc->elapsedTime());
    589 #endif
    590 
    591     m_state = state;
    592     m_doc->frame()->script()->executeScript(sourceCode);
    593     state = m_state;
    594 
    595     state.setAllowYield(true);
    596 
    597 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
    598     if (!m_doc->ownerElement())
    599         printf("ending script execution at %d\n", m_doc->elapsedTime());
    600 #endif
    601 
    602     m_executingScript--;
    603 
    604     if (!m_executingScript && !state.loadingExtScript()) {
    605         m_pendingSrc.prepend(prependingSrc);
    606         m_src.append(m_pendingSrc);
    607         m_pendingSrc.clear();
    608     } else if (!prependingSrc.isEmpty()) {
    609         // restore first so that the write appends in the right place
    610         // (does not hurt to do it again below)
    611         m_currentPrependingSrc = savedPrependingSrc;
    612 
    613         // we need to do this slightly modified bit of one of the write() cases
    614         // because we want to prepend to m_pendingSrc rather than appending
    615         // if there's no previous prependingSrc
    616         if (!m_pendingScripts.isEmpty()) {
    617             if (m_currentPrependingSrc)
    618                 m_currentPrependingSrc->append(prependingSrc);
    619             else
    620                 m_pendingSrc.prepend(prependingSrc);
    621 
    622 #if PRELOAD_SCANNER_ENABLED
    623             // We are stuck waiting for another script. Lets check the source that
    624             // was just document.write()n for anything to load.
    625             PreloadScanner documentWritePreloadScanner(m_doc);
    626             documentWritePreloadScanner.begin();
    627             documentWritePreloadScanner.write(prependingSrc);
    628             documentWritePreloadScanner.end();
    629 #endif
    630         } else {
    631             m_state = state;
    632             write(prependingSrc, false);
    633             state = m_state;
    634         }
    635     }
    636 
    637     m_currentPrependingSrc = savedPrependingSrc;
    638 
    639     return state;
    640 }
    641 
    642 HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)
    643 {
    644     // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.
    645     checkScriptBuffer(src.length());
    646     while (!src.isEmpty()) {
    647         UChar ch = *src;
    648         m_scriptCode[m_scriptCodeSize++] = ch;
    649         if (ch == '>') {
    650             bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());
    651             int endCharsCount = 1; // start off with one for the '>' character
    652             if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {
    653                 endCharsCount = 3;
    654             } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&
    655                 m_scriptCode[m_scriptCodeSize-2] == '!') {
    656                 // Other browsers will accept --!> as a close comment, even though it's
    657                 // not technically valid.
    658                 endCharsCount = 4;
    659             }
    660             if (handleBrokenComments || endCharsCount > 1) {
    661                 src.advancePastNonNewline();
    662                 if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {
    663                     checkScriptBuffer();
    664                     m_scriptCode[m_scriptCodeSize] = 0;
    665                     m_scriptCode[m_scriptCodeSize + 1] = 0;
    666                     m_currentToken.tagName = commentAtom;
    667                     m_currentToken.beginTag = true;
    668                     state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);
    669                     processToken();
    670                     m_currentToken.tagName = commentAtom;
    671                     m_currentToken.beginTag = false;
    672                     processToken();
    673                     m_scriptCodeSize = 0;
    674                 }
    675                 state.setInComment(false);
    676                 return state; // Finished parsing comment
    677             }
    678         }
    679         src.advance(m_lineNumber);
    680     }
    681 
    682     return state;
    683 }
    684 
    685 HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)
    686 {
    687     checkScriptBuffer(src.length());
    688     while (!src.isEmpty()) {
    689         UChar ch = *src;
    690         m_scriptCode[m_scriptCodeSize++] = ch;
    691         if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {
    692             src.advancePastNonNewline();
    693             state.setInServer(false);
    694             m_scriptCodeSize = 0;
    695             return state; // Finished parsing server include
    696         }
    697         src.advance(m_lineNumber);
    698     }
    699     return state;
    700 }
    701 
    702 HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)
    703 {
            // Skips the body of a processing instruction.  Nothing that is skipped is
            // stored.  The instruction ends at a '>' that is either outside quotes or
            // directly preceded by '?'.  On termination the in-processing-instruction
            // flag is cleared and discardLF is set; if src runs out first the state is
            // returned unchanged.
            //
            // Quote tracking uses tquote, which is declared outside this function and
            // therefore survives between calls.
            //
            // NOTE(review): oldchar is a local that starts at 0 on every call, so a '?'
            // read at the very end of one call is not remembered when the '>' is the
            // first character of the next call.  Inside quotes such a split "?>" would
            // not end the instruction -- confirm whether that matters.
    704     UChar oldchar = 0;
    705     while (!src.isEmpty()) {
    706         UChar chbegin = *src;
                // A quote character closes a quote of the same kind; in every other
                // case (including while inside the other kind of quote) it becomes the
                // current quote kind.
    707         if (chbegin == '\'')
    708             tquote = tquote == SingleQuote ? NoQuote : SingleQuote;
    709         else if (chbegin == '\"')
    710             tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;
    711         // Look for '?>'
    712         // Some crappy sites omit the "?" before it, so
    713         // we look for an unquoted '>' instead. (IE compatible)
                // NOTE(review): "!tquote" presumably relies on NoQuote being zero --
                // confirm against the enum definition.
    714         else if (chbegin == '>' && (!tquote || oldchar == '?')) {
    715             // End of the processing instruction: either "?>" or an unquoted '>'
    716             state.setInProcessingInstruction(false);
    717             src.advancePastNonNewline();
    718             state.setDiscardLF(true);
    719             return state; // Finished parsing the processing instruction
    720         }
    721         src.advance(m_lineNumber);
    722         oldchar = chbegin;
    723     }
    724 
    725     return state;
    726 }
    727 
    728 HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)
    729 {
            // Copies all remaining input to m_dest as plain text, converting each CR
            // and each CR LF pair into a single LF.  The skipLF flag travels in the
            // returned state, so a CR LF pair that is split across two calls is still
            // collapsed into one LF.
    730     while (!src.isEmpty()) {
    731         UChar cc = *src;
    732 
                // skipLF is set further down after a CR has been written out as LF.
                // It is cleared here whatever the current character is, and an LF
                // found at this point is dropped instead of being copied.
    733         if (state.skipLF()) {
    734             state.setSkipLF(false);
    735             if (cc == '\n') {
    736                 src.advancePastNewline(m_lineNumber);
    737                 continue;
    738             }
    739         }
    740 
    741         // do we need to enlarge the buffer?
    742         checkBuffer();
    743 
    744         if (cc == '\r') {
                    // Emit LF in place of the CR and remember to swallow a following LF.
    745             state.setSkipLF(true);
    746             *m_dest++ = '\n';
    747         } else
    748             *m_dest++ = cc;
    749         src.advance(m_lineNumber);
    750     }
    751 
    752     return state;
    753 }
    754 
    755 
    756 HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)
    757 {
            // Incrementally parses a character reference (numeric or named).  The
            // leading '&' is never stored in m_cBuffer; it is written back explicitly
            // whenever the reference text has to be emitted verbatim.
            //
            // src        - input; consumed as the reference is recognised.
            // dest       - output cursor; advanced only when text is written verbatim
            //              (reference without a usable value, or view-source mode).
            // state      - tokenizer state; its entity sub-state records progress so
            //              that parsing can resume in a later call when src runs out.
            // cBufferPos - number of characters of the reference collected in
            //              m_cBuffer so far; a reference so it persists across calls.
            // start      - true to begin a new reference: resets cBufferPos, the
            //              entity sub-state and EntityUnicodeValue.
            // parsingTag - enables the IE-compatibility rule applied to named
            //              entities in the EntityName state.
            //
            // Returns the updated state.  Its entity sub-state is NoEntity once the
            // reference has been handled completely.
    758     if (start) {
    759         cBufferPos = 0;
    760         state.setEntityState(SearchEntity);
    761         EntityUnicodeValue = 0;
    762     }
    763 
    764     while (!src.isEmpty()) {
    765         UChar cc = *src;
    766         switch (state.entityState()) {
                // This function is not meant to run without an entity sub-state: trip
                // the assertion and return without consuming anything.
    767         case NoEntity:
    768             ASSERT(state.entityState() != NoEntity);
    769             return state;
    770 
                // First character after '&'.  '#' starts a numeric reference and is
                // stored and consumed; anything else is left in src and handled as the
                // first character of a name.
    771         case SearchEntity:
    772             if (cc == '#') {
    773                 m_cBuffer[cBufferPos++] = cc;
    774                 src.advancePastNonNewline();
    775                 state.setEntityState(NumericSearch);
    776             } else
    777                 state.setEntityState(EntityName);
    778             break;
    779 
                // Character after "&#".  'x'/'X' selects hexadecimal (stored and
                // consumed), a digit selects decimal (left in src), and anything else
                // ends the reference with no value collected.
    780         case NumericSearch:
    781             if (cc == 'x' || cc == 'X') {
    782                 m_cBuffer[cBufferPos++] = cc;
    783                 src.advancePastNonNewline();
    784                 state.setEntityState(Hexadecimal);
    785             } else if (cc >= '0' && cc <= '9')
    786                 state.setEntityState(Decimal);
    787             else
    788                 state.setEntityState(SearchSemicolon);
    789             break;
    790 
                // Accumulates hex digits into EntityUnicodeValue.  Reading stops at the
                // first non-hex character or once m_cBuffer holds 10 characters in
                // total; if src runs out earlier the state stays Hexadecimal so a later
                // call can continue.
    791         case Hexadecimal: {
    792             int ll = min(src.length(), 10 - cBufferPos);
    793             while (ll--) {
    794                 cc = *src;
    795                 if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {
    796                     state.setEntityState(SearchSemicolon);
    797                     break;
    798                 }
    799                 int digit;
    800                 if (cc < 'A')
    801                     digit = cc - '0';
    802                 else
    803                     digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch
    804                 EntityUnicodeValue = EntityUnicodeValue * 16 + digit;
    805                 m_cBuffer[cBufferPos++] = cc;
    806                 src.advancePastNonNewline();
    807             }
    808             if (cBufferPos == 10)
    809                 state.setEntityState(SearchSemicolon);
    810             break;
    811         }
                // Same as Hexadecimal but for decimal digits, with a limit of 9
                // characters in m_cBuffer.
    812         case Decimal:
    813         {
    814             int ll = min(src.length(), 9-cBufferPos);
    815             while (ll--) {
    816                 cc = *src;
    817 
    818                 if (!(cc >= '0' && cc <= '9')) {
    819                     state.setEntityState(SearchSemicolon);
    820                     break;
    821                 }
    822 
    823                 EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');
    824                 m_cBuffer[cBufferPos++] = cc;
    825                 src.advancePastNonNewline();
    826             }
    827             if (cBufferPos == 9)
    828                 state.setEntityState(SearchSemicolon);
    829             break;
    830         }
                // Collects a name made of ASCII letters and digits, at most 9
                // characters long.  The name is complete when another character is
                // seen or the length limit is reached.
    831         case EntityName:
    832         {
    833             int ll = min(src.length(), 9-cBufferPos);
    834             while (ll--) {
    835                 cc = *src;
    836 
    837                 if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {
    838                     state.setEntityState(SearchSemicolon);
    839                     break;
    840                 }
    841 
    842                 m_cBuffer[cBufferPos++] = cc;
    843                 src.advancePastNonNewline();
    844             }
    845             if (cBufferPos == 9)
    846                 state.setEntityState(SearchSemicolon);
                    // Name complete: look it up and then continue in SearchSemicolon.
                    // Otherwise more input is needed and the else branch below leaves
                    // the switch.
    847             if (state.entityState() == SearchSemicolon) {
                        // Names shorter than two characters are never looked up.
    848                 if (cBufferPos > 1) {
    849                     // Since the maximum length of entity name is 9,
    850                     // so a single char array which is allocated on
    851                     // the stack, its length is 10, should be OK.
    852                     // Also if we have an illegal character, we treat it
    853                     // as illegal entity name.
    854                     unsigned testedEntityNameLen = 0;
    855                     char tmpEntityNameBuffer[10];
    856 
    857                     ASSERT(cBufferPos < 10);
                            // Narrow the collected UChars to char, stopping early at any
                            // code unit above 0x7e.
    858                     for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {
    859                         if (m_cBuffer[testedEntityNameLen] > 0x7e)
    860                             break;
    861                         tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];
    862                     }
    863 
    864                     const Entity *e;
    865 
                            // Only look the name up if every character was copied.
    866                     if (testedEntityNameLen == cBufferPos)
    867                         e = findEntity(tmpEntityNameBuffer, cBufferPos);
    868                     else
    869                         e = 0;
    870 
    871                     if (e)
    872                         EntityUnicodeValue = e->code;
    873 
    874                     // be IE compatible
                            // When parsingTag is set, a value above 255 is discarded
                            // unless the next input character is ';'.
                            // NOTE(review): the loop above may have consumed the last
                            // character of src, so *src here (and in the SearchSemicolon
                            // code reached by the fall-through) can be evaluated on an
                            // empty SegmentedString -- confirm that this is safe.
    875                     if (parsingTag && EntityUnicodeValue > 255 && *src != ';')
    876                         EntityUnicodeValue = 0;
    877                 }
    878             }
    879             else
    880                 break;
                    // No break here: a completed name deliberately falls through into
                    // SearchSemicolon.
    881         }
                // Final step: either resolve the reference to a character or emit the
                // collected text verbatim, then leave entity parsing.
    882         case SearchSemicolon:
    883             // Don't allow values that are more than 21 bits.
    884             if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {
    885                 if (!inViewSourceMode()) {
                            // A terminating ';' is optional and consumed if present.  The
                            // decoded character is pushed back onto src instead of being
                            // written to dest.
    886                     if (*src == ';')
    887                         src.advancePastNonNewline();
    888                     if (EntityUnicodeValue <= 0xFFFF) {
    889                         checkBuffer();
    890                         src.push(fixUpChar(EntityUnicodeValue));
    891                     } else {
    892                         // Convert to UTF-16, using surrogate code points.
    893                         checkBuffer(2);
    894                         src.push(U16_LEAD(EntityUnicodeValue));
    895                         src.push(U16_TRAIL(EntityUnicodeValue));
    896                     }
    897                 } else {
    898                     // FIXME: We should eventually colorize entities by sending them as a special token.
    899                     // 12 bytes required: up to 10 bytes in m_cBuffer plus the
    900                     // leading '&' and trailing ';'
    901                     checkBuffer(12);
                            // View-source mode keeps the reference as written: '&', the
                            // collected characters and, if present, the ';'.
    902                     *dest++ = '&';
    903                     for (unsigned i = 0; i < cBufferPos; i++)
    904                         dest[i] = m_cBuffer[i];
    905                     dest += cBufferPos;
    906                     if (*src == ';') {
    907                         *dest++ = ';';
    908                         src.advancePastNonNewline();
    909                     }
    910                 }
    911             } else {
    912                 // 11 bytes required: up to 10 bytes in m_cBuffer plus the
    913                 // leading '&'
    914                 checkBuffer(11);
    915                 // ignore the sequence, add it to the buffer as plaintext
                        // A following ';' is not consumed on this path; it stays in src.
    916                 *dest++ = '&';
    917                 for (unsigned i = 0; i < cBufferPos; i++)
    918                     dest[i] = m_cBuffer[i];
    919                 dest += cBufferPos;
    920             }
    921 
    922             state.setEntityState(NoEntity);
    923             return state;
    924         }
    925     }
    926 
            // src ran out before the reference was complete; the entity sub-state and
            // cBufferPos carry the progress into the next call.
    927     return state;
    928 }
    929 
    930 HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)
    931 {
            // State machine for the remainder of a doctype declaration; parseTag()
            // calls it once the "<!DOCTYPE" sequence has been recognised.  It runs
            // while input is available and the in-doctype flag is set.  Progress is
            // kept in m_doctypeToken.state(), so parsing resumes where it stopped when
            // called again with more input.
            //
            // The name, public identifier and system identifier are accumulated in
            // m_doctypeToken (m_name, m_publicID, m_systemID).  In view-source mode
            // every consumed character except the closing '>' is also appended to
            // m_doctypeToken.m_source.
            //
            // A closing '>' always clears the in-doctype flag.  processDoctypeToken()
            // is called unconditionally for well-formed doctypes, and only in
            // view-source mode for malformed ones (one exception is flagged below).
            //
            // Returns the updated state.
    932     ASSERT(state.inDoctype());
    933     while (!src.isEmpty() && state.inDoctype()) {
    934         UChar c = *src;
    935         bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';
    936         switch (m_doctypeToken.state()) {
                    // Entry state: move on to DoctypeBeforeName, consuming the current
                    // character only if it is whitespace.
    937             case DoctypeBegin: {
    938                 m_doctypeToken.setState(DoctypeBeforeName);
    939                 if (isWhitespace) {
    940                     src.advance(m_lineNumber);
    941                     if (inViewSourceMode())
    942                         m_doctypeToken.m_source.append(c);
    943                 }
    944                 break;
    945             }
                    // Skip whitespace before the name.  The first other character starts
                    // the name and is left in src for DoctypeName.
    946             case DoctypeBeforeName: {
    947                 if (c == '>') {
    948                     // Malformed.  Just exit.
    949                     src.advancePastNonNewline();
    950                     state.setInDoctype(false);
    951                     if (inViewSourceMode())
    952                         processDoctypeToken();
    953                 } else if (isWhitespace) {
    954                     src.advance(m_lineNumber);
    955                     if (inViewSourceMode())
    956                         m_doctypeToken.m_source.append(c);
    957                 } else
    958                     m_doctypeToken.setState(DoctypeName);
    959                 break;
    960             }
                    // Collect the name until whitespace or '>'.
    961             case DoctypeName: {
    962                 if (c == '>') {
    963                     // Valid doctype. Emit it.
    964                     src.advancePastNonNewline();
    965                     state.setInDoctype(false);
    966                     processDoctypeToken();
    967                 } else if (isWhitespace) {
    968                     m_doctypeSearchCount = 0; // Used now to scan for PUBLIC
    969                     m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM
    970                     m_doctypeToken.setState(DoctypeAfterName);
    971                     src.advance(m_lineNumber);
    972                     if (inViewSourceMode())
    973                         m_doctypeToken.m_source.append(c);
    974                 } else {
    975                     src.advancePastNonNewline();
    976                     m_doctypeToken.m_name.append(c);
    977                     if (inViewSourceMode())
    978                         m_doctypeToken.m_source.append(c);
    979                 }
    980                 break;
    981             }
                    // After the name: match the keyword that follows, one character per
                    // iteration and case-insensitively, against publicStart and
                    // systemStart.  Both counters act as indices into those tables.
                    // NOTE(review): publicStart/systemStart are defined elsewhere;
                    // presumably the lowercase strings "public" and "system" -- confirm.
    982             case DoctypeAfterName: {
    983                 if (c == '>') {
    984                     // Valid doctype. Emit it.
    985                     src.advancePastNonNewline();
    986                     state.setInDoctype(false);
    987                     processDoctypeToken();
    988                 } else if (!isWhitespace) {
    989                     src.advancePastNonNewline();
    990                     if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {
    991                         m_doctypeSearchCount++;
    992                         if (m_doctypeSearchCount == 6)
    993                             // Found 'PUBLIC' sequence
    994                             m_doctypeToken.setState(DoctypeBeforePublicID);
    995                     } else if (m_doctypeSearchCount > 0) {
                                // A partial PUBLIC match was broken: give up on this doctype.
    996                         m_doctypeSearchCount = 0;
    997                         m_doctypeToken.setState(DoctypeBogus);
    998                     } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {
    999                         m_doctypeSecondarySearchCount++;
   1000                         if (m_doctypeSecondarySearchCount == 6)
   1001                             // Found 'SYSTEM' sequence
   1002                             m_doctypeToken.setState(DoctypeBeforeSystemID);
   1003                     } else {
                                // Neither keyword matches at this position.
   1004                         m_doctypeSecondarySearchCount = 0;
   1005                         m_doctypeToken.setState(DoctypeBogus);
   1006                     }
   1007                     if (inViewSourceMode())
   1008                         m_doctypeToken.m_source.append(c);
   1009                 } else {
   1010                     src.advance(m_lineNumber); // Whitespace keeps us in the after name state.
   1011                     if (inViewSourceMode())
   1012                         m_doctypeToken.m_source.append(c);
   1013                 }
   1014                 break;
   1015             }
                    // After PUBLIC: expect the opening quote of the public identifier.
                    // The quote kind is remembered in tquote so that only the matching
                    // quote closes the identifier.
   1016             case DoctypeBeforePublicID: {
   1017                 if (c == '\"' || c == '\'') {
   1018                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
   1019                     m_doctypeToken.setState(DoctypePublicID);
   1020                     src.advancePastNonNewline();
   1021                     if (inViewSourceMode())
   1022                         m_doctypeToken.m_source.append(c);
   1023                 } else if (c == '>') {
   1024                     // Considered bogus.  Don't process the doctype.
   1025                     src.advancePastNonNewline();
   1026                     state.setInDoctype(false);
   1027                     if (inViewSourceMode())
   1028                         processDoctypeToken();
   1029                 } else if (isWhitespace) {
   1030                     src.advance(m_lineNumber);
   1031                     if (inViewSourceMode())
   1032                         m_doctypeToken.m_source.append(c);
   1033                 } else
   1034                     m_doctypeToken.setState(DoctypeBogus);
   1035                 break;
   1036             }
                    // Inside the quoted public identifier.  A '>' before the closing
                    // quote ends the doctype as malformed.
   1037             case DoctypePublicID: {
   1038                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
   1039                     src.advancePastNonNewline();
   1040                     m_doctypeToken.setState(DoctypeAfterPublicID);
   1041                     if (inViewSourceMode())
   1042                         m_doctypeToken.m_source.append(c);
   1043                 } else if (c == '>') {
   1044                      // Considered bogus.  Don't process the doctype.
   1045                     src.advancePastNonNewline();
   1046                     state.setInDoctype(false);
   1047                     if (inViewSourceMode())
   1048                         processDoctypeToken();
   1049                 } else {
   1050                     m_doctypeToken.m_publicID.append(c);
   1051                     src.advance(m_lineNumber);
   1052                     if (inViewSourceMode())
   1053                         m_doctypeToken.m_source.append(c);
   1054                 }
   1055                 break;
   1056             }
                    // After the public identifier: an optional quoted system identifier
                    // may follow; '>' here completes a valid doctype.
   1057             case DoctypeAfterPublicID:
   1058                 if (c == '\"' || c == '\'') {
   1059                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
   1060                     m_doctypeToken.setState(DoctypeSystemID);
   1061                     src.advancePastNonNewline();
   1062                     if (inViewSourceMode())
   1063                         m_doctypeToken.m_source.append(c);
   1064                 } else if (c == '>') {
   1065                     // Valid doctype. Emit it now.
   1066                     src.advancePastNonNewline();
   1067                     state.setInDoctype(false);
   1068                     processDoctypeToken();
   1069                 } else if (isWhitespace) {
   1070                     src.advance(m_lineNumber);
   1071                     if (inViewSourceMode())
   1072                         m_doctypeToken.m_source.append(c);
   1073                 } else
   1074                     m_doctypeToken.setState(DoctypeBogus);
   1075                 break;
                    // After SYSTEM: expect the opening quote of the system identifier.
   1076             case DoctypeBeforeSystemID:
   1077                 if (c == '\"' || c == '\'') {
   1078                     tquote = c == '\"' ? DoubleQuote : SingleQuote;
   1079                     m_doctypeToken.setState(DoctypeSystemID);
   1080                     src.advancePastNonNewline();
   1081                     if (inViewSourceMode())
   1082                         m_doctypeToken.m_source.append(c);
   1083                 } else if (c == '>') {
   1084                     // Considered bogus.  Don't process the doctype.
                            // NOTE(review): unlike the other malformed/bogus '>' branches
                            // in this function, this one does not call
                            // processDoctypeToken() in view-source mode -- confirm whether
                            // the omission is intentional.
   1085                     src.advancePastNonNewline();
   1086                     state.setInDoctype(false);
   1087                 } else if (isWhitespace) {
   1088                     src.advance(m_lineNumber);
   1089                     if (inViewSourceMode())
   1090                         m_doctypeToken.m_source.append(c);
   1091                 } else
   1092                     m_doctypeToken.setState(DoctypeBogus);
   1093                 break;
                    // Inside the quoted system identifier.  A '>' before the closing
                    // quote ends the doctype as malformed.
   1094             case DoctypeSystemID:
   1095                 if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {
   1096                     src.advancePastNonNewline();
   1097                     m_doctypeToken.setState(DoctypeAfterSystemID);
   1098                     if (inViewSourceMode())
   1099                         m_doctypeToken.m_source.append(c);
   1100                 } else if (c == '>') {
   1101                      // Considered bogus.  Don't process the doctype.
   1102                     src.advancePastNonNewline();
   1103                     state.setInDoctype(false);
   1104                     if (inViewSourceMode())
   1105                         processDoctypeToken();
   1106                 } else {
   1107                     m_doctypeToken.m_systemID.append(c);
   1108                     src.advance(m_lineNumber);
   1109                     if (inViewSourceMode())
   1110                         m_doctypeToken.m_source.append(c);
   1111                 }
   1112                 break;
                    // After the system identifier only whitespace and the closing '>'
                    // are accepted.
   1113             case DoctypeAfterSystemID:
   1114                 if (c == '>') {
   1115                     // Valid doctype. Emit it now.
   1116                     src.advancePastNonNewline();
   1117                     state.setInDoctype(false);
   1118                     processDoctypeToken();
   1119                 } else if (isWhitespace) {
   1120                     src.advance(m_lineNumber);
   1121                     if (inViewSourceMode())
   1122                         m_doctypeToken.m_source.append(c);
   1123                 } else
   1124                     m_doctypeToken.setState(DoctypeBogus);
   1125                 break;
                    // Error recovery: discard everything up to and including the next
                    // '>'.
   1126             case DoctypeBogus:
   1127                 if (c == '>') {
   1128                     // Done with the bogus doctype.
   1129                     src.advancePastNonNewline();
   1130                     state.setInDoctype(false);
   1131                     if (inViewSourceMode())
   1132                        processDoctypeToken();
   1133                 } else {
   1134                     src.advance(m_lineNumber); // Just keep scanning for '>'
   1135                     if (inViewSourceMode())
   1136                         m_doctypeToken.m_source.append(c);
   1137                 }
   1138                 break;
   1139             default:
   1140                 break;
   1141         }
   1142     }
   1143     return state;
   1144 }
   1145 
   1146 HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)
   1147 {
   1148     ASSERT(!state.hasEntityState());
   1149 
   1150     unsigned cBufferPos = m_cBufferPos;
   1151 
   1152     bool lastIsSlash = false;
   1153 
   1154     while (!src.isEmpty()) {
   1155         checkBuffer();
   1156         switch (state.tagState()) {
   1157         case NoTag:
   1158         {
   1159             m_cBufferPos = cBufferPos;
   1160             return state;
   1161         }
   1162         case TagName:
   1163         {
   1164             if (searchCount > 0) {
   1165                 if (*src == commentStart[searchCount]) {
   1166                     searchCount++;
   1167                     if (searchCount == 2)
   1168                         m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.
   1169                     else
   1170                         m_doctypeSearchCount = 0;
   1171                     if (searchCount == 4) {
   1172                         // Found '<!--' sequence
   1173                         src.advancePastNonNewline();
   1174                         m_dest = m_buffer; // ignore the previous part of this tag
   1175                         state.setInComment(true);
   1176                         state.setTagState(NoTag);
   1177 
   1178                         // Fix bug 34302 at kde.bugs.org.  Go ahead and treat
   1179                         // <!--> as a valid comment, since both mozilla and IE on windows
   1180                         // can handle this case.  Only do this in quirks mode. -dwh
   1181                         if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {
   1182                             state.setInComment(false);
   1183                             src.advancePastNonNewline();
   1184                             if (!src.isEmpty())
   1185                                 m_cBuffer[cBufferPos++] = *src;
   1186                         } else
   1187                           state = parseComment(src, state);
   1188 
   1189                         m_cBufferPos = cBufferPos;
   1190                         return state; // Finished parsing tag!
   1191                     }
   1192                     m_cBuffer[cBufferPos++] = *src;
   1193                     src.advancePastNonNewline();
   1194                     break;
   1195                 } else
   1196                     searchCount = 0; // Stop looking for '<!--' sequence
   1197             }
   1198 
   1199             if (m_doctypeSearchCount > 0) {
   1200                 if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {
   1201                     m_doctypeSearchCount++;
   1202                     m_cBuffer[cBufferPos++] = *src;
   1203                     src.advancePastNonNewline();
   1204                     if (m_doctypeSearchCount == 9) {
   1205                         // Found '<!DOCTYPE' sequence
   1206                         state.setInDoctype(true);
   1207                         state.setTagState(NoTag);
   1208                         m_doctypeToken.reset();
   1209                         if (inViewSourceMode())
   1210                             m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);
   1211                         state = parseDoctype(src, state);
   1212                         m_cBufferPos = cBufferPos;
   1213                         return state;
   1214                     }
   1215                     break;
   1216                 } else
   1217                     m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence
   1218             }
   1219 
   1220             bool finish = false;
   1221             unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);
   1222             while (ll--) {
   1223                 UChar curchar = *src;
   1224                 if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {
   1225                     finish = true;
   1226                     break;
   1227                 }
   1228 
   1229                 // tolower() shows up on profiles. This is faster!
   1230                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
   1231                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
   1232                 else
   1233                     m_cBuffer[cBufferPos++] = curchar;
   1234                 src.advancePastNonNewline();
   1235             }
   1236 
   1237             // Disadvantage: we add the possible rest of the tag
   1238             // as attribute names. ### judge if this causes problems
   1239             if (finish || CBUFLEN == cBufferPos) {
   1240                 bool beginTag;
   1241                 UChar* ptr = m_cBuffer;
   1242                 unsigned int len = cBufferPos;
   1243                 m_cBuffer[cBufferPos] = '\0';
   1244                 if ((cBufferPos > 0) && (*ptr == '/')) {
   1245                     // End Tag
   1246                     beginTag = false;
   1247                     ptr++;
   1248                     len--;
   1249                 }
   1250                 else
   1251                     // Start Tag
   1252                     beginTag = true;
   1253 
   1254                 // Ignore the / in fake xml tags like <br/>.  We trim off the "/" so that we'll get "br" as the tag name and not "br/".
   1255                 if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())
   1256                     ptr[--len] = '\0';
   1257 
   1258                 // Now that we've shaved off any invalid / that might have followed the name), make the tag.
   1259                 // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)
   1260                 if (ptr[0] != '!' || inViewSourceMode()) {
   1261                     m_currentToken.tagName = AtomicString(ptr);
   1262                     m_currentToken.beginTag = beginTag;
   1263                 }
   1264                 m_dest = m_buffer;
   1265                 state.setTagState(SearchAttribute);
   1266                 cBufferPos = 0;
   1267             }
   1268             break;
   1269         }
   1270         case SearchAttribute:
   1271             while (!src.isEmpty()) {
   1272                 UChar curchar = *src;
   1273                 // In this mode just ignore any quotes we encounter and treat them like spaces.
   1274                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {
   1275                     if (curchar == '<' || curchar == '>')
   1276                         state.setTagState(SearchEnd);
   1277                     else
   1278                         state.setTagState(AttributeName);
   1279 
   1280                     cBufferPos = 0;
   1281                     break;
   1282                 }
   1283                 if (inViewSourceMode())
   1284                     m_currentToken.addViewSourceChar(curchar);
   1285                 src.advance(m_lineNumber);
   1286             }
   1287             break;
   1288         case AttributeName:
   1289         {
   1290             m_rawAttributeBeforeValue.clear();
   1291             int ll = min(src.length(), CBUFLEN - cBufferPos);
   1292             while (ll--) {
   1293                 UChar curchar = *src;
   1294                 // If we encounter a "/" when scanning an attribute name, treat it as a delimiter.  This allows the
   1295                 // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).
   1296                 if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {
   1297                     m_cBuffer[cBufferPos] = '\0';
   1298                     m_attrName = AtomicString(m_cBuffer);
   1299                     m_dest = m_buffer;
   1300                     *m_dest++ = 0;
   1301                     state.setTagState(SearchEqual);
   1302                     if (inViewSourceMode())
   1303                         m_currentToken.addViewSourceChar('a');
   1304                     break;
   1305                 }
   1306 
   1307                 // tolower() shows up on profiles. This is faster!
   1308                 if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())
   1309                     m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');
   1310                 else
   1311                     m_cBuffer[cBufferPos++] = curchar;
   1312 
   1313                 m_rawAttributeBeforeValue.append(curchar);
   1314                 src.advance(m_lineNumber);
   1315             }
   1316             if (cBufferPos == CBUFLEN) {
   1317                 m_cBuffer[cBufferPos] = '\0';
   1318                 m_attrName = AtomicString(m_cBuffer);
   1319                 m_dest = m_buffer;
   1320                 *m_dest++ = 0;
   1321                 state.setTagState(SearchEqual);
   1322                 if (inViewSourceMode())
   1323                     m_currentToken.addViewSourceChar('a');
   1324             }
   1325             break;
   1326         }
   1327         case SearchEqual:
   1328             while (!src.isEmpty()) {
   1329                 UChar curchar = *src;
   1330 
   1331                 if (lastIsSlash && curchar == '>') {
   1332                     // This is a quirk (with a long sad history).  We have to do this
   1333                     // since widgets do <script src="foo.js"/> and expect the tag to close.
   1334                     if (m_currentToken.tagName == scriptTag)
   1335                         m_currentToken.selfClosingTag = true;
   1336                     m_currentToken.brokenXMLStyle = true;
   1337                 }
   1338 
   1339                 // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.
   1340                 if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {
   1341                     if (curchar == '=') {
   1342                         state.setTagState(SearchValue);
   1343                         if (inViewSourceMode())
   1344                             m_currentToken.addViewSourceChar(curchar);
   1345                         m_rawAttributeBeforeValue.append(curchar);
   1346                         src.advancePastNonNewline();
   1347                     } else {
   1348                         m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());
   1349                         m_dest = m_buffer;
   1350                         state.setTagState(SearchAttribute);
   1351                         lastIsSlash = false;
   1352                     }
   1353                     break;
   1354                 }
   1355 
   1356                 lastIsSlash = curchar == '/';
   1357 
   1358                 if (inViewSourceMode())
   1359                     m_currentToken.addViewSourceChar(curchar);
   1360                 m_rawAttributeBeforeValue.append(curchar);
   1361                 src.advance(m_lineNumber);
   1362             }
   1363             break;
   1364         case SearchValue:
   1365             while (!src.isEmpty()) {
   1366                 UChar curchar = *src;
   1367                 if (!isASCIISpace(curchar)) {
   1368                     if (curchar == '\'' || curchar == '\"') {
   1369                         tquote = curchar == '\"' ? DoubleQuote : SingleQuote;
   1370                         state.setTagState(QuotedValue);
   1371                         if (inViewSourceMode())
   1372                             m_currentToken.addViewSourceChar(curchar);
   1373                         m_rawAttributeBeforeValue.append(curchar);
   1374                         src.advancePastNonNewline();
   1375                     } else
   1376                         state.setTagState(Value);
   1377 
   1378                     break;
   1379                 }
   1380                 if (inViewSourceMode())
   1381                     m_currentToken.addViewSourceChar(curchar);
   1382                 m_rawAttributeBeforeValue.append(curchar);
   1383                 src.advance(m_lineNumber);
   1384             }
   1385             break;
   1386         case QuotedValue:
   1387             while (!src.isEmpty()) {
   1388                 checkBuffer();
   1389 
   1390                 UChar curchar = *src;
   1391                 if (curchar <= '>' && !src.escaped()) {
   1392                     if (curchar == '>' && m_attrName.isEmpty()) {
   1393                         // Handle a case like <img '>.  Just go ahead and be willing
   1394                         // to close the whole tag.  Don't consume the character and
   1395                         // just go back into SearchEnd while ignoring the whole
   1396                         // value.
   1397                         // FIXME: Note that this is actually not a very good solution.
   1398                         // It doesn't handle the general case of
   1399                         // unmatched quotes among attributes that have names. -dwh
   1400                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
   1401                             m_dest--; // remove trailing newlines
   1402                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
   1403                         if (!attributeValue.contains('/'))
   1404                             m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)
   1405                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
   1406                         if (inViewSourceMode())
   1407                             m_currentToken.addViewSourceChar('x');
   1408                         state.setTagState(SearchAttribute);
   1409                         m_dest = m_buffer;
   1410                         tquote = NoQuote;
   1411                         break;
   1412                     }
   1413 
   1414                     if (curchar == '&') {
   1415                         src.advancePastNonNewline();
   1416                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
   1417                         break;
   1418                     }
   1419 
   1420                     if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {
   1421                         // some <input type=hidden> rely on trailing spaces. argh
   1422                         while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))
   1423                             m_dest--; // remove trailing newlines
   1424                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
   1425                         if (m_attrName.isEmpty() && !attributeValue.contains('/')) {
   1426                             m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)
   1427                             if (inViewSourceMode())
   1428                                 m_currentToken.addViewSourceChar('x');
   1429                         } else if (inViewSourceMode())
   1430                             m_currentToken.addViewSourceChar('v');
   1431 
   1432                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
   1433                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
   1434                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
   1435                                 attributeValue = blankURL().string();
   1436                         }
   1437 
   1438                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
   1439                         m_dest = m_buffer;
   1440                         state.setTagState(SearchAttribute);
   1441                         tquote = NoQuote;
   1442                         if (inViewSourceMode())
   1443                             m_currentToken.addViewSourceChar(curchar);
   1444                         src.advancePastNonNewline();
   1445                         break;
   1446                     }
   1447                 }
   1448 
   1449                 *m_dest++ = curchar;
   1450                 src.advance(m_lineNumber);
   1451             }
   1452             break;
   1453         case Value:
   1454             while (!src.isEmpty()) {
   1455                 checkBuffer();
   1456                 UChar curchar = *src;
   1457                 if (curchar <= '>' && !src.escaped()) {
   1458                     // parse Entities
   1459                     if (curchar == '&') {
   1460                         src.advancePastNonNewline();
   1461                         state = parseEntity(src, m_dest, state, cBufferPos, true, true);
   1462                         break;
   1463                     }
   1464                     // no quotes. Every space means end of value
   1465                     // '/' does not delimit in IE!
   1466                     if (isASCIISpace(curchar) || curchar == '>') {
   1467                         AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);
   1468 
   1469                         if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode() && m_attrName == srcAttr) {
   1470                             String context(m_rawAttributeBeforeValue.data(), m_rawAttributeBeforeValue.size());
   1471                             if (m_XSSAuditor && !m_XSSAuditor->canLoadExternalScriptFromSrc(context, attributeValue))
   1472                                 attributeValue = blankURL().string();
   1473                         }
   1474 
   1475                         m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());
   1476                         if (inViewSourceMode())
   1477                             m_currentToken.addViewSourceChar('v');
   1478                         m_dest = m_buffer;
   1479                         state.setTagState(SearchAttribute);
   1480                         break;
   1481                     }
   1482                 }
   1483 
   1484                 *m_dest++ = curchar;
   1485                 src.advance(m_lineNumber);
   1486             }
   1487             break;
   1488         case SearchEnd:
   1489         {
   1490             while (!src.isEmpty()) {
   1491                 UChar ch = *src;
   1492                 if (ch == '>' || ch == '<')
   1493                     break;
   1494                 if (ch == '/')
   1495                     m_currentToken.selfClosingTag = true;
   1496                 if (inViewSourceMode())
   1497                     m_currentToken.addViewSourceChar(ch);
   1498                 src.advance(m_lineNumber);
   1499             }
   1500             if (src.isEmpty())
   1501                 break;
   1502 
   1503             searchCount = 0; // Stop looking for '<!--' sequence
   1504             state.setTagState(NoTag);
   1505             tquote = NoQuote;
   1506 
   1507             if (*src != '<')
   1508                 src.advance(m_lineNumber);
   1509 
   1510             if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown
   1511                 m_cBufferPos = cBufferPos;
   1512                 return state;
   1513             }
   1514 
   1515             AtomicString tagName = m_currentToken.tagName;
   1516 
   1517             // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard
   1518             // compatibility.
   1519             bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;
   1520             bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;
   1521             if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {
   1522                 Attribute* a = 0;
   1523                 m_scriptTagSrcAttrValue = String();
   1524                 m_scriptTagCharsetAttrValue = String();
   1525                 if (m_currentToken.attrs && !m_fragment) {
   1526                     if (m_doc->frame() && m_doc->frame()->script()->canExecuteScripts()) {
   1527                         if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))
   1528                             m_scriptTagSrcAttrValue = m_doc->completeURL(deprecatedParseURL(a->value())).string();
   1529                     }
   1530                 }
   1531             }
   1532 
   1533             RefPtr<Node> n = processToken();
   1534             m_cBufferPos = cBufferPos;
   1535             if (n || inViewSourceMode()) {
   1536                 State savedState = state;
   1537                 SegmentedString savedSrc = src;
   1538                 long savedLineno = m_lineNumber;
   1539                 if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {
   1540                     if (beginTag)
   1541                         state.setDiscardLF(true); // Discard the first LF after we open a pre.
   1542                 } else if (tagName == scriptTag) {
   1543                     ASSERT(!m_scriptNode);
   1544                     m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);
   1545                     if (m_scriptNode)
   1546                         m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();
   1547                     if (beginTag) {
   1548                         m_searchStopper = scriptEnd;
   1549                         m_searchStopperLength = 8;
   1550                         state.setInScript(true);
   1551                         state = parseNonHTMLText(src, state);
   1552                     } else if (isSelfClosingScript) { // Handle <script src="foo"/>
   1553                         state.setInScript(true);
   1554                         state = scriptHandler(state);
   1555                     }
   1556                 } else if (tagName == styleTag) {
   1557                     if (beginTag) {
   1558                         m_searchStopper = styleEnd;
   1559                         m_searchStopperLength = 7;
   1560                         state.setInStyle(true);
   1561                         state = parseNonHTMLText(src, state);
   1562                     }
   1563                 } else if (tagName == textareaTag) {
   1564                     if (beginTag) {
   1565                         m_searchStopper = textareaEnd;
   1566                         m_searchStopperLength = 10;
   1567                         state.setInTextArea(true);
   1568                         state = parseNonHTMLText(src, state);
   1569                     }
   1570                 } else if (tagName == titleTag) {
   1571                     if (beginTag) {
   1572                         m_searchStopper = titleEnd;
   1573                         m_searchStopperLength = 7;
   1574                         state.setInTitle(true);
   1575                         state = parseNonHTMLText(src, state);
   1576                     }
   1577                 } else if (tagName == xmpTag) {
   1578                     if (beginTag) {
   1579                         m_searchStopper = xmpEnd;
   1580                         m_searchStopperLength = 5;
   1581                         state.setInXmp(true);
   1582                         state = parseNonHTMLText(src, state);
   1583                     }
   1584                 } else if (tagName == iframeTag) {
   1585                     if (beginTag) {
   1586                         m_searchStopper = iframeEnd;
   1587                         m_searchStopperLength = 8;
   1588                         state.setInIFrame(true);
   1589                         state = parseNonHTMLText(src, state);
   1590                     }
   1591                 }
   1592                 if (src.isEmpty() && (state.inTitle() || inViewSourceMode()) && !state.inComment() && !(state.inScript() && m_currentScriptTagStartLineNumber)) {
   1593                     // We just ate the rest of the document as the #text node under the special tag!
   1594                     // Reset the state then retokenize without special handling.
   1595                     // Let the parser clean up the missing close tag.
   1596                     // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're
   1597                     // at the end of the document unless m_noMoreData is also true. We need
   1598                     // to detect this case elsewhere, and save the state somewhere other
   1599                     // than a local variable.
   1600                     state = savedState;
   1601                     src = savedSrc;
   1602                     m_lineNumber = savedLineno;
   1603                     m_scriptCodeSize = 0;
   1604                 }
   1605             }
   1606             if (tagName == plaintextTag)
   1607                 state.setInPlainText(beginTag);
   1608             return state; // Finished parsing tag!
   1609         }
   1610         } // end switch
   1611     }
   1612     m_cBufferPos = cBufferPos;
   1613     return state;
   1614 }
   1615 
   1616 inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)
   1617 {
   1618     // We don't want to be checking elapsed time with every character, so we only check after we've
   1619     // processed a certain number of characters.
   1620     bool allowedYield = state.allowYield();
   1621     state.setAllowYield(false);
   1622     if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {
   1623         processedCount = 0;
   1624         if (currentTime() - startTime > m_tokenizerTimeDelay) {
   1625             /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to
   1626                load, but this hurts overall performance on slower machines.  For now turn this
   1627                off.
   1628             || (!m_doc->haveStylesheetsLoaded() &&
   1629                 (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/
   1630             // Schedule the timer to keep processing as soon as possible.
   1631             m_timer.startOneShot(0);
   1632 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   1633             if (currentTime() - startTime > m_tokenizerTimeDelay)
   1634                 printf("Deferring processing of data because 500ms elapsed away from event loop.\n");
   1635 #endif
   1636             return false;
   1637         }
   1638     }
   1639 
   1640     processedCount++;
   1641     return true;
   1642 }
   1643 
   1644 void HTMLTokenizer::write(const SegmentedString& str, bool appendData)
   1645 {
            // Accepts a copy of |str| and tokenizes as much of the queued input as it can.
            //  - Returns at once when there is no buffer (m_buffer is null) or the parser was stopped.
            //  - When a script is executing and |appendData| is true, or scripts are still pending,
            //    the input is only queued (m_currentPrependingSrc or m_pendingSrc) and not parsed.
            //  - Otherwise the input joins m_src and the loop below consumes m_src character by
            //    character until it is empty, a location change is pending on the frame, or
            //    continueProcessing() asks for a pause.
            // The call to end() near the bottom may delete this object (see the comment there).
   1646     if (!m_buffer)
   1647         return;
   1648 
   1649     if (m_parserStopped)
   1650         return;
   1651 
   1652     SegmentedString source(str);
   1653     if (m_executingScript)
   1654         source.setExcludeLineNumbers();
   1655 
   1656     if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {
   1657         // don't parse; we will do this later
   1658         if (m_currentPrependingSrc)
   1659             m_currentPrependingSrc->append(source);
   1660         else {
   1661             m_pendingSrc.append(source);
   1662 #if PRELOAD_SCANNER_ENABLED
   1663             if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
   1664                 m_preloadScanner->write(source);
   1665 #endif
   1666         }
   1667         return;
   1668     }
   1669 
   1670 #if PRELOAD_SCANNER_ENABLED
   1671     if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)
   1672         m_preloadScanner->end();
   1673 #endif
   1674 
            // Queue the new input behind anything that is still waiting in m_src.
   1675     if (!m_src.isEmpty())
   1676         m_src.append(source);
   1677     else
   1678         setSrc(source);
   1679 
   1680     // Once a timer is set, it has control of when the tokenizer continues.
   1681     if (m_timer.isActive())
   1682         return;
   1683 
            // Remember the incoming flag value; it is put back after the loop rather than
            // simply cleared.
   1684     bool wasInWrite = m_inWrite;
   1685     m_inWrite = true;
   1686 
   1687 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   1688     if (!m_doc->ownerElement())
   1689         printf("Beginning write at time %d\n", m_doc->elapsedTime());
   1690 #endif
   1691 
   1692     int processedCount = 0;
   1693     double startTime = currentTime();
   1694 #ifdef ANDROID_INSTRUMENT
   1695     android::TimeCounter::start(android::TimeCounter::ParsingTimeCounter);
   1696 #endif
   1697 
   1698 #if ENABLE(INSPECTOR)
   1699     if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
   1700         timelineAgent->willWriteHTML(source.length(), m_lineNumber);
   1701 #endif
   1702 
   1703     Frame* frame = m_doc->frame();
   1704 
            // Work on a local copy of the state; it is stored back into m_state after the loop.
   1705     State state = m_state;
   1706 
   1707     while (!m_src.isEmpty() && (!frame || !frame->redirectScheduler()->locationChangePending())) {
   1708         if (!continueProcessing(processedCount, startTime, state))
   1709             break;
   1710 
   1711         // do we need to enlarge the buffer?
   1712         checkBuffer();
   1713 
   1714         UChar cc = *m_src;
   1715 
                // skipLF is set after a '\r' (see the newline branch further down). It is cleared
                // here so that it only ever applies to the very next character.
   1716         bool wasSkipLF = state.skipLF();
   1717         if (wasSkipLF)
   1718             state.setSkipLF(false);
   1719 
                // A '\n' directly after a '\r' is consumed without being written to m_dest.
   1720         if (wasSkipLF && (cc == '\n'))
   1721             m_src.advance();
   1722         else if (state.needsSpecialWriteHandling()) {
   1723             // it's important to keep needsSpecialWriteHandling with the flags this block tests
   1724             if (state.hasEntityState())
   1725                 state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());
   1726             else if (state.inPlainText())
   1727                 state = parseText(m_src, state);
   1728             else if (state.inAnyNonHTMLText())
   1729                 state = parseNonHTMLText(m_src, state);
   1730             else if (state.inComment())
   1731                 state = parseComment(m_src, state);
   1732             else if (state.inDoctype())
   1733                 state = parseDoctype(m_src, state);
   1734             else if (state.inServer())
   1735                 state = parseServer(m_src, state);
   1736             else if (state.inProcessingInstruction())
   1737                 state = parseProcessingInstruction(m_src, state);
   1738             else if (state.hasTagState())
   1739                 state = parseTag(m_src, state);
   1740             else if (state.startTag()) {
                        // |cc| is the character that follows a '<' seen on an earlier iteration
                        // (the '<' branch below sets startTag).
   1741                 state.setStartTag(false);
   1742 
   1743                 switch (cc) {
   1744                 case '/':
                            // Nothing to set up here; '/' stays in m_src and reaches parseTag() below.
   1745                     break;
   1746                 case '!': {
   1747                     // <!-- comment --> or <!DOCTYPE ...>
   1748                     searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype
   1749                     m_doctypeSearchCount = 1;
   1750                     break;
   1751                 }
   1752                 case '?': {
   1753                     // xml processing instruction
   1754                     state.setInProcessingInstruction(true);
   1755                     tquote = NoQuote;
   1756                     state = parseProcessingInstruction(m_src, state);
   1757                     continue;
   1758 
                            // Not reached: the continue above already left the switch.
   1759                     break;
   1760                 }
   1761                 case '%':
   1762                     if (!m_brokenServer) {
   1763                         // <% server stuff, handle as comment %>
   1764                         state.setInServer(true);
   1765                         tquote = NoQuote;
   1766                         state = parseServer(m_src, state);
   1767                         continue;
   1768                     }
   1769                     // else fall through
   1770                 default: {
   1771                     if ( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {
   1772                         // Start of a Start-Tag
   1773                     } else {
   1774                         // Invalid tag
   1775                         // Add as is
                                // The '<' becomes literal text. |cc| is looked at again on the next
                                // iteration: startTag was cleared above and m_src has not moved.
   1776                         *m_dest = '<';
   1777                         m_dest++;
   1778                         continue;
   1779                     }
   1780                 }
   1781                 }; // end case
   1782 
                        // Call processToken() for whatever is buffered so far, then let parseTag()
                        // take over starting in the TagName state.
   1783                 processToken();
   1784 
   1785                 m_cBufferPos = 0;
   1786                 state.setTagState(TagName);
   1787                 state = parseTag(m_src, state);
   1788             }
   1789         } else if (cc == '&' && !m_src.escaped()) {
   1790             m_src.advancePastNonNewline();
   1791             state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());
   1792         } else if (cc == '<' && !m_src.escaped()) {
                    // Only record that a tag may be starting; the character after the '<' is
                    // inspected on the next iteration (presumably through the startTag branch
                    // above via needsSpecialWriteHandling() - confirm).
   1793             m_currentTagStartLineNumber = m_lineNumber;
   1794             m_src.advancePastNonNewline();
   1795             state.setStartTag(true);
   1796             state.setDiscardLF(false);
   1797         } else if (cc == '\n' || cc == '\r') {
   1798             if (state.discardLF())
   1799                 // Ignore this LF
   1800                 state.setDiscardLF(false); // We have discarded 1 LF
   1801             else {
   1802                 // Process this LF
                        // Both '\r' and '\n' are written out as '\n'.
   1803                 *m_dest++ = '\n';
                        // NOTE(review): '\r' is counted by hand here, presumably because
                        // advance(m_lineNumber) below only counts '\n' - confirm against SegmentedString.
   1804                 if (cc == '\r' && !m_src.excludeLineNumbers())
   1805                     m_lineNumber++;
   1806             }
   1807 
   1808             /* Check for MS-DOS CRLF sequence */
   1809             if (cc == '\r')
   1810                 state.setSkipLF(true);
   1811             m_src.advance(m_lineNumber);
   1812         } else {
                    // Ordinary character: copy it to the output buffer.
   1813             state.setDiscardLF(false);
   1814             *m_dest++ = cc;
   1815             m_src.advancePastNonNewline();
   1816         }
   1817     }
   1818 
   1819 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   1820     if (!m_doc->ownerElement())
   1821         printf("Ending write at time %d\n", m_doc->elapsedTime());
   1822 #endif
   1823 
   1824 #if ENABLE(INSPECTOR)
   1825     if (InspectorTimelineAgent* timelineAgent = m_doc->inspectorTimelineAgent())
   1826         timelineAgent->didWriteHTML(m_lineNumber);
   1827 #endif
   1828 
            // Put back the flag value saved before the loop.
   1829     m_inWrite = wasInWrite;
   1830 
            // Publish the locally updated state.
   1831     m_state = state;
   1832 
   1833 #ifdef ANDROID_INSTRUMENT
   1834     android::TimeCounter::record(android::TimeCounter::ParsingTimeCounter, __FUNCTION__);
   1835 #endif
   1836 
   1837     if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
   1838         end(); // this actually causes us to be deleted
   1839 
   1840     // After parsing, go ahead and dispatch image beforeload events.
            // NOTE(review): this runs after end() may have deleted us; the call is qualified with
            // ImageLoader:: and names no member of this object.
   1841     ImageLoader::dispatchPendingBeforeLoadEvents();
   1842 }
   1843 
   1844 void HTMLTokenizer::stopParsing()
   1845 {
   1846     Tokenizer::stopParsing();
   1847     m_timer.stop();
   1848 
   1849     // The part needs to know that the tokenizer has finished with its data,
   1850     // regardless of whether it happened naturally or due to manual intervention.
   1851     if (!m_fragment && m_doc->frame())
   1852         m_doc->frame()->loader()->tokenizerProcessedData();
   1853 }
   1854 
   1855 bool HTMLTokenizer::processingData() const
   1856 {
   1857     return m_timer.isActive() || m_inWrite;
   1858 }
   1859 
   1860 void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)
   1861 {
   1862 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   1863     if (!m_doc->ownerElement())
   1864         printf("Beginning timer write at time %d\n", m_doc->elapsedTime());
   1865 #endif
   1866 
   1867     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
   1868         // Restart the timer and let layout win.  This is basically a way of ensuring that the layout
   1869         // timer has higher priority than our timer.
   1870         m_timer.startOneShot(0);
   1871         return;
   1872     }
   1873 
   1874     // Invoke write() as though more data came in. This might cause us to get deleted.
   1875     write(SegmentedString(), true);
   1876 }
   1877 
   1878 void HTMLTokenizer::end()
   1879 {
   1880     ASSERT(!m_timer.isActive());
   1881     m_timer.stop(); // Only helps if assertion above fires, but do it anyway.
   1882 
   1883     if (m_buffer) {
   1884         // parseTag is using the buffer for different matters
   1885         if (!m_state.hasTagState())
   1886             processToken();
   1887 
   1888         fastFree(m_scriptCode);
   1889         m_scriptCode = 0;
   1890         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
   1891 
   1892         fastFree(m_buffer);
   1893         m_buffer = 0;
   1894     }
   1895 
   1896     if (!inViewSourceMode())
   1897         m_parser->finished();
   1898     else
   1899         m_doc->finishedParsing();
   1900 }
   1901 
   1902 void HTMLTokenizer::finish()
   1903 {
   1904     // do this as long as we don't find matching comment ends
   1905     while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {
   1906         // we've found an unmatched comment start
   1907         if (m_state.inComment())
   1908             m_brokenComments = true;
   1909         else
   1910             m_brokenServer = true;
   1911         checkScriptBuffer();
   1912         m_scriptCode[m_scriptCodeSize] = 0;
   1913         m_scriptCode[m_scriptCodeSize + 1] = 0;
   1914         int pos;
   1915         String food;
   1916         if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())
   1917             food = String(m_scriptCode, m_scriptCodeSize);
   1918         else if (m_state.inServer()) {
   1919             food = "<";
   1920             food.append(m_scriptCode, m_scriptCodeSize);
   1921         } else {
   1922             pos = find(m_scriptCode, m_scriptCodeSize, '>');
   1923             food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);
   1924         }
   1925         fastFree(m_scriptCode);
   1926         m_scriptCode = 0;
   1927         m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;
   1928         m_state.setInComment(false);
   1929         m_state.setInServer(false);
   1930         if (!food.isEmpty())
   1931             write(food, true);
   1932     }
   1933     // this indicates we will not receive any more data... but if we are waiting on
   1934     // an external script to load, we can't finish parsing until that is done
   1935     m_noMoreData = true;
   1936     if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())
   1937         end(); // this actually causes us to be deleted
   1938 }
   1939 
   1940 PassRefPtr<Node> HTMLTokenizer::processToken()
   1941 {
   1942     ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;
   1943     if (scriptController && scriptController->canExecuteScripts())
   1944         // FIXME: Why isn't this m_currentScriptTagStartLineNumber?  I suspect this is wrong.
   1945         scriptController->setEventHandlerLineNumber(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.
   1946     if (m_dest > m_buffer) {
   1947         m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);
   1948         if (m_currentToken.tagName != commentAtom)
   1949             m_currentToken.tagName = textAtom;
   1950     } else if (m_currentToken.tagName == nullAtom) {
   1951         m_currentToken.reset();
   1952         if (scriptController)
   1953             scriptController->setEventHandlerLineNumber(m_lineNumber + 1); // Script line numbers are 1 based.
   1954         return 0;
   1955     }
   1956 
   1957     m_dest = m_buffer;
   1958 
   1959     RefPtr<Node> n;
   1960 
   1961     if (!m_parserStopped) {
   1962         if (NamedMappedAttrMap* map = m_currentToken.attrs.get())
   1963             map->shrinkToLength();
   1964         if (inViewSourceMode())
   1965             static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);
   1966         else
   1967             // pass the token over to the parser, the parser DOES NOT delete the token
   1968             n = m_parser->parseToken(&m_currentToken);
   1969     }
   1970     m_currentToken.reset();
   1971     if (scriptController)
   1972         scriptController->setEventHandlerLineNumber(0);
   1973 
   1974     return n.release();
   1975 }
   1976 
   1977 void HTMLTokenizer::processDoctypeToken()
   1978 {
   1979     if (inViewSourceMode())
   1980         static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);
   1981     else
   1982         m_parser->parseDoctypeToken(&m_doctypeToken);
   1983 }
   1984 
   1985 HTMLTokenizer::~HTMLTokenizer()
   1986 {
   1987     ASSERT(!m_inWrite);
   1988     reset();
   1989 }
   1990 
   1991 
   1992 void HTMLTokenizer::enlargeBuffer(int len)
   1993 {
   1994     // Resize policy: Always at least double the size of the buffer each time.
   1995     int delta = max(len, m_bufferSize);
   1996 
   1997     // Check for overflow.
   1998     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
   1999     static const int maxSize = INT_MAX / sizeof(UChar);
   2000     if (delta > maxSize - m_bufferSize)
   2001         CRASH();
   2002 
   2003     int newSize = m_bufferSize + delta;
   2004     int oldOffset = m_dest - m_buffer;
   2005     m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));
   2006     m_dest = m_buffer + oldOffset;
   2007     m_bufferSize = newSize;
   2008 }
   2009 
   2010 void HTMLTokenizer::enlargeScriptBuffer(int len)
   2011 {
   2012     // Resize policy: Always at least double the size of the buffer each time.
   2013     int delta = max(len, m_scriptCodeCapacity);
   2014 
   2015     // Check for overflow.
   2016     // For now, handle overflow the same way we handle fastRealloc failure, with CRASH.
   2017     static const int maxSize = INT_MAX / sizeof(UChar);
   2018     if (delta > maxSize - m_scriptCodeCapacity)
   2019         CRASH();
   2020 
   2021     int newSize = m_scriptCodeCapacity + delta;
   2022     // If we allow fastRealloc(ptr, 0), it will call CRASH(). We run into this
   2023     // case if the HTML being parsed begins with "<!--" and there's more data
   2024     // coming.
   2025     if (!newSize) {
   2026         ASSERT(!m_scriptCode);
   2027         return;
   2028     }
   2029 
   2030     m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));
   2031     m_scriptCodeCapacity = newSize;
   2032 }
   2033 
   2034 void HTMLTokenizer::executeScriptsWaitingForStylesheets()
   2035 {
   2036     ASSERT(m_doc->haveStylesheetsLoaded());
   2037 
   2038     if (m_hasScriptsWaitingForStylesheets)
   2039         notifyFinished(0);
   2040 }
   2041 
   2042 void HTMLTokenizer::notifyFinished(CachedResource*)
   2043 {
   2044     executeExternalScriptsIfReady();
   2045 }
   2046 
   2047 void HTMLTokenizer::executeExternalScriptsIfReady()
   2048 {
   2049 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   2050     if (!m_doc->ownerElement())
   2051         printf("script loaded at %d\n", m_doc->elapsedTime());
   2052 #endif
   2053 
   2054     ASSERT(!m_pendingScripts.isEmpty());
   2055 
   2056     // Make external scripts wait for external stylesheets.
   2057     // FIXME: This needs to be done for inline scripts too.
   2058     m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();
   2059     if (m_hasScriptsWaitingForStylesheets)
   2060         return;
   2061 
   2062     bool finished = false;
   2063 
   2064     double startTime = currentTime();
   2065     while (!finished && m_pendingScripts.first()->isLoaded()) {
   2066         if (!continueExecutingExternalScripts(startTime))
   2067             break;
   2068 
   2069         CachedScript* cs = m_pendingScripts.first().get();
   2070         m_pendingScripts.removeFirst();
   2071         ASSERT(cache()->disabled() || cs->accessCount() > 0);
   2072 
   2073         setSrc(SegmentedString());
   2074 
   2075         // make sure we forget about the script before we execute the new one
   2076         // infinite recursion might happen otherwise
   2077         ScriptSourceCode sourceCode(cs);
   2078         bool errorOccurred = cs->errorOccurred();
   2079         cs->removeClient(this);
   2080 
   2081         RefPtr<Node> n = m_scriptNode.release();
   2082 
   2083 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   2084         if (!m_doc->ownerElement())
   2085             printf("external script beginning execution at %d\n", m_doc->elapsedTime());
   2086 #endif
   2087 
   2088         if (errorOccurred)
   2089             n->dispatchEvent(Event::create(eventNames().errorEvent, true, false));
   2090         else {
   2091             if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())
   2092                 m_state = scriptExecution(sourceCode, m_state);
   2093 #if ENABLE(XHTMLMP)
   2094             else
   2095                 m_doc->setShouldProcessNoscriptElement(true);
   2096 #endif
   2097             n->dispatchEvent(Event::create(eventNames().loadEvent, false, false));
   2098         }
   2099 
   2100         // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()
   2101         // call above, so test afterwards.
   2102         finished = m_pendingScripts.isEmpty();
   2103         if (finished) {
   2104             ASSERT(!m_hasScriptsWaitingForStylesheets);
   2105             m_state.setLoadingExtScript(false);
   2106 #ifdef INSTRUMENT_LAYOUT_SCHEDULING
   2107             if (!m_doc->ownerElement())
   2108                 printf("external script finished execution at %d\n", m_doc->elapsedTime());
   2109 #endif
   2110         } else if (m_hasScriptsWaitingForStylesheets) {
   2111             // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.
   2112             // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.
   2113             finished = true;
   2114         }
   2115 
   2116         // 'm_requestingScript' is true when we are called synchronously from
   2117         // scriptHandler(). In that case scriptHandler() will take care
   2118         // of m_pendingSrc.
   2119         if (!m_requestingScript) {
   2120             SegmentedString rest = m_pendingSrc;
   2121             m_pendingSrc.clear();
   2122             write(rest, false);
   2123             // we might be deleted at this point, do not access any members.
   2124         }
   2125     }
   2126 }
   2127 
   2128 void HTMLTokenizer::executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*)
   2129 {
   2130     if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {
   2131         // Restart the timer and do layout first.
   2132         m_externalScriptsTimer.startOneShot(0);
   2133         return;
   2134     }
   2135 
   2136     // Continue executing external scripts.
   2137     executeExternalScriptsIfReady();
   2138 }
   2139 
   2140 bool HTMLTokenizer::continueExecutingExternalScripts(double startTime)
   2141 {
   2142     if (m_externalScriptsTimer.isActive())
   2143         return false;
   2144 
   2145     if (currentTime() - startTime > m_tokenizerTimeDelay) {
   2146         // Schedule the timer to keep processing as soon as possible.
   2147         m_externalScriptsTimer.startOneShot(0);
   2148         return false;
   2149     }
   2150     return true;
   2151 }
   2152 
   2153 bool HTMLTokenizer::isWaitingForScripts() const
   2154 {
   2155     return m_state.loadingExtScript();
   2156 }
   2157 
   2158 void HTMLTokenizer::setSrc(const SegmentedString& source)
   2159 {
   2160     m_src = source;
   2161 }
   2162 
   2163 void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment, FragmentScriptingPermission scriptingPermission)
   2164 {
   2165     HTMLTokenizer tok(fragment, scriptingPermission);
   2166     tok.setForceSynchronous(true);
   2167     tok.write(source, true);
   2168     tok.finish();
   2169     ASSERT(!tok.processingData());      // make sure we're done (see 3963151)
   2170 }
   2171 
   2172 UChar decodeNamedEntity(const char* name)
   2173 {
   2174     const Entity* e = findEntity(name, strlen(name));
   2175     return e ? e->code : 0;
   2176 }
   2177 
   2178 }
   2179