Home | History | Annotate | Download | only in html
      1 /*
      2     Copyright (C) 1997 Martin Jones (mjones (at) kde.org)
      3               (C) 1997 Torben Weis (weis (at) kde.org)
      4               (C) 1998 Waldo Bastian (bastian (at) kde.org)
      5               (C) 2001 Dirk Mueller (mueller (at) kde.org)
      6     Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      7 
      8     This library is free software; you can redistribute it and/or
      9     modify it under the terms of the GNU Library General Public
     10     License as published by the Free Software Foundation; either
     11     version 2 of the License, or (at your option) any later version.
     12 
     13     This library is distributed in the hope that it will be useful,
     14     but WITHOUT ANY WARRANTY; without even the implied warranty of
     15     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16     Library General Public License for more details.
     17 
     18     You should have received a copy of the GNU Library General Public License
     19     along with this library; see the file COPYING.LIB.  If not, write to
     20     the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     21     Boston, MA 02110-1301, USA.
     22 */
     23 
     24 #ifndef HTMLTokenizer_h
     25 #define HTMLTokenizer_h
     26 
     27 #include "CachedResourceClient.h"
     28 #include "CachedResourceHandle.h"
     29 #include "NamedMappedAttrMap.h"
     30 #include "MappedAttributeEntry.h"
     31 #include "SegmentedString.h"
     32 #include "Timer.h"
     33 #include "Tokenizer.h"
     34 #include <wtf/Deque.h>
     35 #include <wtf/OwnPtr.h>
     36 #include <wtf/Vector.h>
     37 
     38 namespace WebCore {
     39 
     40 class CachedScript;
     41 class DocumentFragment;
     42 class Document;
     43 class HTMLDocument;
     44 class HTMLScriptElement;
     45 class HTMLViewSourceDocument;
     46 class FrameView;
     47 class HTMLParser;
     48 class Node;
     49 class PreloadScanner;
     50 class ScriptSourceCode;
     51 
     52 /**
     53  * @internal
     54  * represents one HTML tag. Consists of a numerical id, and the list
     55  * of attributes. Can also represent text. In this case the id = 0 and
     56  * text contains the text.
     57  */
     58 struct Token {
     59     Token()
     60         : beginTag(true)
     61         , selfClosingTag(false)
     62         , brokenXMLStyle(false)
     63         , m_sourceInfo(0)
     64     { }
     65     ~Token() { }
     66 
     67     void addAttribute(AtomicString& attrName, const AtomicString& v, bool viewSourceMode);
     68 
     69     bool isOpenTag(const QualifiedName& fullName) const { return beginTag && fullName.localName() == tagName; }
     70     bool isCloseTag(const QualifiedName& fullName) const { return !beginTag && fullName.localName() == tagName; }
     71 
     72     void reset()
     73     {
     74         attrs = 0;
     75         text = 0;
     76         tagName = nullAtom;
     77         beginTag = true;
     78         selfClosingTag = false;
     79         brokenXMLStyle = false;
     80         if (m_sourceInfo)
     81             m_sourceInfo->clear();
     82     }
     83 
     84     void addViewSourceChar(UChar c) { if (!m_sourceInfo.get()) m_sourceInfo.set(new Vector<UChar>); m_sourceInfo->append(c); }
     85 
     86     RefPtr<NamedMappedAttrMap> attrs;
     87     RefPtr<StringImpl> text;
     88     AtomicString tagName;
     89     bool beginTag;
     90     bool selfClosingTag;
     91     bool brokenXMLStyle;
     92     OwnPtr<Vector<UChar> > m_sourceInfo;
     93 };
     94 
     95 enum DoctypeState {
     96     DoctypeBegin,
     97     DoctypeBeforeName,
     98     DoctypeName,
     99     DoctypeAfterName,
    100     DoctypeBeforePublicID,
    101     DoctypePublicID,
    102     DoctypeAfterPublicID,
    103     DoctypeBeforeSystemID,
    104     DoctypeSystemID,
    105     DoctypeAfterSystemID,
    106     DoctypeBogus
    107 };
    108 
    109 class DoctypeToken {
    110 public:
    111     DoctypeToken() {}
    112 
    113     void reset()
    114     {
    115         m_name.clear();
    116         m_publicID.clear();
    117         m_systemID.clear();
    118         m_state = DoctypeBegin;
    119         m_source.clear();
    120     }
    121 
    122     DoctypeState state() { return m_state; }
    123     void setState(DoctypeState s) { m_state = s; }
    124 
    125     Vector<UChar> m_name;
    126     Vector<UChar> m_publicID;
    127     Vector<UChar> m_systemID;
    128     DoctypeState m_state;
    129 
    130     Vector<UChar> m_source;
    131 };
    132 
    133 //-----------------------------------------------------------------------------
    134 
    135 class HTMLTokenizer : public Tokenizer, public CachedResourceClient {
    136 public:
    137     HTMLTokenizer(HTMLDocument*, bool reportErrors);
    138     HTMLTokenizer(HTMLViewSourceDocument*);
    139     HTMLTokenizer(DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
    140     virtual ~HTMLTokenizer();
    141 
    142     virtual void write(const SegmentedString&, bool appendData);
    143     virtual void finish();
    144     virtual void setForceSynchronous(bool force);
    145     virtual bool isWaitingForScripts() const;
    146     virtual void stopParsing();
    147     virtual bool processingData() const;
    148     virtual int executingScript() const { return m_executingScript; }
    149 
    150     virtual int lineNumber() const { return m_lineNumber; }
    151     virtual int columnNumber() const { return 1; }
    152 
    153     bool processingContentWrittenByScript() const { return m_src.excludeLineNumbers(); }
    154 
    155     virtual void executeScriptsWaitingForStylesheets();
    156 
    157     virtual bool isHTMLTokenizer() const { return true; }
    158     HTMLParser* htmlParser() const { return m_parser.get(); }
    159 
    160 private:
    161     class State;
    162 
    163     // Where we are in parsing a tag
    164     void begin();
    165     void end();
    166 
    167     void reset();
    168 
    169     PassRefPtr<Node> processToken();
    170     void processDoctypeToken();
    171 
    172     State processListing(SegmentedString, State);
    173     State parseComment(SegmentedString&, State);
    174     State parseDoctype(SegmentedString&, State);
    175     State parseServer(SegmentedString&, State);
    176     State parseText(SegmentedString&, State);
    177     State parseNonHTMLText(SegmentedString&, State);
    178     State parseTag(SegmentedString&, State);
    179     State parseEntity(SegmentedString&, UChar*& dest, State, unsigned& cBufferPos, bool start, bool parsingTag);
    180     State parseProcessingInstruction(SegmentedString&, State);
    181     State scriptHandler(State);
    182     State scriptExecution(const ScriptSourceCode&, State);
    183     void setSrc(const SegmentedString&);
    184 
    185     // check if we have enough space in the buffer.
    186     // if not enlarge it
    187     inline void checkBuffer(int len = 10)
    188     {
    189         if ((m_dest - m_buffer) > m_bufferSize - len)
    190             enlargeBuffer(len);
    191     }
    192 
    193     inline void checkScriptBuffer(int len = 10)
    194     {
    195         if (m_scriptCodeSize + len >= m_scriptCodeCapacity)
    196             enlargeScriptBuffer(len);
    197     }
    198 
    199     void enlargeBuffer(int len);
    200     void enlargeScriptBuffer(int len);
    201 
    202     bool continueProcessing(int& processedCount, double startTime, State&);
    203     void timerFired(Timer<HTMLTokenizer>*);
    204     void allDataProcessed();
    205 
    206     // from CachedResourceClient
    207     void notifyFinished(CachedResource*);
    208 
    209     void executeExternalScriptsIfReady();
    210     void executeExternalScriptsTimerFired(Timer<HTMLTokenizer>*);
    211     bool continueExecutingExternalScripts(double startTime);
    212 
    213     // Internal buffers
    214     ///////////////////
    215     UChar* m_buffer;
    216     int m_bufferSize;
    217     UChar* m_dest;
    218 
    219     Token m_currentToken;
    220 
    221     // This buffer holds the raw characters we've seen between the beginning of
    222     // the attribute name and the first character of the attribute value.
    223     Vector<UChar, 32> m_rawAttributeBeforeValue;
    224 
    225     // Tokenizer flags
    226     //////////////////
    227     // are we in quotes within a html tag
    228     enum { NoQuote, SingleQuote, DoubleQuote } tquote;
    229 
    230     // Are we in a &... character entity description?
    231     enum EntityState {
    232         NoEntity = 0,
    233         SearchEntity = 1,
    234         NumericSearch = 2,
    235         Hexadecimal = 3,
    236         Decimal = 4,
    237         EntityName = 5,
    238         SearchSemicolon = 6
    239     };
    240     unsigned EntityUnicodeValue;
    241 
    242     enum TagState {
    243         NoTag = 0,
    244         TagName = 1,
    245         SearchAttribute = 2,
    246         AttributeName = 3,
    247         SearchEqual = 4,
    248         SearchValue = 5,
    249         QuotedValue = 6,
    250         Value = 7,
    251         SearchEnd = 8
    252     };
    253 
    254     class State {
    255     public:
    256         State() : m_bits(0) { }
    257 
    258         TagState tagState() const { return static_cast<TagState>(m_bits & TagMask); }
    259         void setTagState(TagState t) { m_bits = (m_bits & ~TagMask) | t; }
    260         EntityState entityState() const { return static_cast<EntityState>((m_bits & EntityMask) >> EntityShift); }
    261         void setEntityState(EntityState e) { m_bits = (m_bits & ~EntityMask) | (e << EntityShift); }
    262 
    263         bool inScript() const { return testBit(InScript); }
    264         void setInScript(bool v) { setBit(InScript, v); }
    265         bool inStyle() const { return testBit(InStyle); }
    266         void setInStyle(bool v) { setBit(InStyle, v); }
    267         bool inXmp() const { return testBit(InXmp); }
    268         void setInXmp(bool v) { setBit(InXmp, v); }
    269         bool inTitle() const { return testBit(InTitle); }
    270         void setInTitle(bool v) { setBit(InTitle, v); }
    271         bool inIFrame() const { return testBit(InIFrame); }
    272         void setInIFrame(bool v) { setBit(InIFrame, v); }
    273         bool inPlainText() const { return testBit(InPlainText); }
    274         void setInPlainText(bool v) { setBit(InPlainText, v); }
    275         bool inProcessingInstruction() const { return testBit(InProcessingInstruction); }
    276         void setInProcessingInstruction(bool v) { return setBit(InProcessingInstruction, v); }
    277         bool inComment() const { return testBit(InComment); }
    278         void setInComment(bool v) { setBit(InComment, v); }
    279         bool inDoctype() const { return testBit(InDoctype); }
    280         void setInDoctype(bool v) { setBit(InDoctype, v); }
    281         bool inTextArea() const { return testBit(InTextArea); }
    282         void setInTextArea(bool v) { setBit(InTextArea, v); }
    283         bool escaped() const { return testBit(Escaped); }
    284         void setEscaped(bool v) { setBit(Escaped, v); }
    285         bool inServer() const { return testBit(InServer); }
    286         void setInServer(bool v) { setBit(InServer, v); }
    287         bool skipLF() const { return testBit(SkipLF); }
    288         void setSkipLF(bool v) { setBit(SkipLF, v); }
    289         bool startTag() const { return testBit(StartTag); }
    290         void setStartTag(bool v) { setBit(StartTag, v); }
    291         bool discardLF() const { return testBit(DiscardLF); }
    292         void setDiscardLF(bool v) { setBit(DiscardLF, v); }
    293         bool allowYield() const { return testBit(AllowYield); }
    294         void setAllowYield(bool v) { setBit(AllowYield, v); }
    295         bool loadingExtScript() const { return testBit(LoadingExtScript); }
    296         void setLoadingExtScript(bool v) { setBit(LoadingExtScript, v); }
    297         bool forceSynchronous() const { return testBit(ForceSynchronous); }
    298         void setForceSynchronous(bool v) { setBit(ForceSynchronous, v); }
    299 
    300         bool inAnyNonHTMLText() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame); }
    301         bool hasTagState() const { return m_bits & TagMask; }
    302         bool hasEntityState() const { return m_bits & EntityMask; }
    303 
    304         bool needsSpecialWriteHandling() const { return m_bits & (InScript | InStyle | InXmp | InTextArea | InTitle | InIFrame | TagMask | EntityMask | InPlainText | InComment | InDoctype | InServer | InProcessingInstruction | StartTag); }
    305 
    306     private:
    307         static const int EntityShift = 4;
    308         enum StateBits {
    309             TagMask = (1 << 4) - 1,
    310             EntityMask = (1 << 7) - (1 << 4),
    311             InScript = 1 << 7,
    312             InStyle = 1 << 8,
    313             // Bit 9 unused
    314             InXmp = 1 << 10,
    315             InTitle = 1 << 11,
    316             InPlainText = 1 << 12,
    317             InProcessingInstruction = 1 << 13,
    318             InComment = 1 << 14,
    319             InTextArea = 1 << 15,
    320             Escaped = 1 << 16,
    321             InServer = 1 << 17,
    322             SkipLF = 1 << 18,
    323             StartTag = 1 << 19,
    324             DiscardLF = 1 << 20, // FIXME: should clarify difference between skip and discard
    325             AllowYield = 1 << 21,
    326             LoadingExtScript = 1 << 22,
    327             ForceSynchronous = 1 << 23,
    328             InIFrame = 1 << 24,
    329             InDoctype = 1 << 25
    330         };
    331 
    332         void setBit(StateBits bit, bool value)
    333         {
    334             if (value)
    335                 m_bits |= bit;
    336             else
    337                 m_bits &= ~bit;
    338         }
    339         bool testBit(StateBits bit) const { return m_bits & bit; }
    340 
    341         unsigned m_bits;
    342     };
    343 
    344     State m_state;
    345 
    346     DoctypeToken m_doctypeToken;
    347     int m_doctypeSearchCount;
    348     int m_doctypeSecondarySearchCount;
    349 
    350     bool m_brokenServer;
    351 
    352     // Name of an attribute that we just scanned.
    353     AtomicString m_attrName;
    354 
    355     // Used to store the code of a scripting sequence
    356     UChar* m_scriptCode;
    357     // Size of the script sequenze stored in @ref #scriptCode
    358     int m_scriptCodeSize;
    359     // Maximal size that can be stored in @ref #scriptCode
    360     int m_scriptCodeCapacity;
    361     // resync point of script code size
    362     int m_scriptCodeResync;
    363 
    364     // Stores characters if we are scanning for a string like "</script>"
    365     UChar searchBuffer[10];
    366 
    367     // Counts where we are in the string we are scanning for
    368     int searchCount;
    369     // the stopper string
    370     const char* m_searchStopper;
    371     int m_searchStopperLength;
    372 
    373     // if no more data is coming, just parse what we have (including ext scripts that
    374     // may be still downloading) and finish
    375     bool m_noMoreData;
    376     // URL to get source code of script from
    377     String m_scriptTagSrcAttrValue;
    378     String m_scriptTagCharsetAttrValue;
    379     // the HTML code we will parse after the external script we are waiting for has loaded
    380     SegmentedString m_pendingSrc;
    381 
    382     // the HTML code we will parse after this particular script has
    383     // loaded, but before all pending HTML
    384     SegmentedString* m_currentPrependingSrc;
    385 
    386     // true if we are executing a script while parsing a document. This causes the parsing of
    387     // the output of the script to be postponed until after the script has finished executing
    388     int m_executingScript;
    389     Deque<CachedResourceHandle<CachedScript> > m_pendingScripts;
    390     RefPtr<HTMLScriptElement> m_scriptNode;
    391 
    392     bool m_requestingScript;
    393     bool m_hasScriptsWaitingForStylesheets;
    394 
    395     // if we found one broken comment, there are most likely others as well
    396     // store a flag to get rid of the O(n^2) behaviour in such a case.
    397     bool m_brokenComments;
    398     // current line number
    399     int m_lineNumber;
    400     int m_currentScriptTagStartLineNumber;
    401     int m_currentTagStartLineNumber;
    402 
    403     double m_tokenizerTimeDelay;
    404     int m_tokenizerChunkSize;
    405 
    406     // The timer for continued processing.
    407     Timer<HTMLTokenizer> m_timer;
    408 
    409     // The timer for continued executing external scripts.
    410     Timer<HTMLTokenizer> m_externalScriptsTimer;
    411 
    412 // This buffer can hold arbitrarily long user-defined attribute names, such as in EMBED tags.
    413 // So any fixed number might be too small, but rather than rewriting all usage of this buffer
    414 // we'll just make it large enough to handle all imaginable cases.
    415 #define CBUFLEN 1024
    416     UChar m_cBuffer[CBUFLEN + 2];
    417     unsigned int m_cBufferPos;
    418 
    419     SegmentedString m_src;
    420     Document* m_doc;
    421     OwnPtr<HTMLParser> m_parser;
    422     bool m_inWrite;
    423     bool m_fragment;
    424     FragmentScriptingPermission m_scriptingPermission;
    425 
    426     OwnPtr<PreloadScanner> m_preloadScanner;
    427 };
    428 
    429 void parseHTMLDocumentFragment(const String&, DocumentFragment*, FragmentScriptingPermission = FragmentScriptingAllowed);
    430 
    431 UChar decodeNamedEntity(const char*);
    432 
    433 } // namespace WebCore
    434 
    435 #endif // HTMLTokenizer_h
    436