Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      3  * Copyright (C) 2011 Apple Inc. All rights reserved.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY GOOGLE INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GOOGLE INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #ifndef HTMLTreeBuilder_h
     28 #define HTMLTreeBuilder_h
     29 
     30 #include "Element.h"
     31 #include "FragmentScriptingPermission.h"
     32 #include "HTMLConstructionSite.h"
     33 #include "HTMLElementStack.h"
     34 #include "HTMLFormattingElementList.h"
     35 #include "HTMLTokenizer.h"
     36 #include <wtf/text/TextPosition.h>
     37 #include <wtf/Noncopyable.h>
     38 #include <wtf/OwnPtr.h>
     39 #include <wtf/PassOwnPtr.h>
     40 #include <wtf/PassRefPtr.h>
     41 #include <wtf/RefPtr.h>
     42 #include <wtf/unicode/Unicode.h>
     43 
     44 namespace WebCore {
     45 
     46 class AtomicHTMLToken;
     47 class Document;
     48 class DocumentFragment;
     49 class Frame;
     50 class HTMLToken;
     51 class HTMLDocument;
     52 class Node;
     53 class HTMLDocumentParser;
     54 
     55 class HTMLTreeBuilder {
     56     WTF_MAKE_NONCOPYABLE(HTMLTreeBuilder); WTF_MAKE_FAST_ALLOCATED;
     57 public:
     58     static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, HTMLDocument* document, bool reportErrors, bool usePreHTML5ParserQuirks)
     59     {
     60         return adoptPtr(new HTMLTreeBuilder(parser, document, reportErrors, usePreHTML5ParserQuirks));
     61     }
     62     static PassOwnPtr<HTMLTreeBuilder> create(HTMLDocumentParser* parser, DocumentFragment* fragment, Element* contextElement, FragmentScriptingPermission scriptingPermission, bool usePreHTML5ParserQuirks)
     63     {
     64         return adoptPtr(new HTMLTreeBuilder(parser, fragment, contextElement, scriptingPermission, usePreHTML5ParserQuirks));
     65     }
     66     ~HTMLTreeBuilder();
     67 
     68     bool isParsingFragment() const { return !!m_fragmentContext.fragment(); }
     69 
     70     void detach();
     71 
     72     void setPaused(bool paused) { m_isPaused = paused; }
     73     bool isPaused() const { return m_isPaused; }
     74 
     75     // The token really should be passed as a const& since it's never modified.
     76     void constructTreeFromToken(HTMLToken&);
     77     void constructTreeFromAtomicToken(AtomicHTMLToken&);
     78 
     79     // Must be called when parser is paused before calling the parser again.
     80     PassRefPtr<Element> takeScriptToProcess(TextPosition1& scriptStartPosition);
     81 
     82     // Done, close any open tags, etc.
     83     void finished();
     84 
     85     static bool scriptEnabled(Frame*);
     86     static bool pluginsEnabled(Frame*);
     87 
     88 private:
     89     class FakeInsertionMode;
     90     class ExternalCharacterTokenBuffer;
     91     // Represents HTML5 "insertion mode"
     92     // http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#insertion-mode
     93     enum InsertionMode {
     94         InitialMode,
     95         BeforeHTMLMode,
     96         BeforeHeadMode,
     97         InHeadMode,
     98         InHeadNoscriptMode,
     99         AfterHeadMode,
    100         InBodyMode,
    101         TextMode,
    102         InTableMode,
    103         InTableTextMode,
    104         InCaptionMode,
    105         InColumnGroupMode,
    106         InTableBodyMode,
    107         InRowMode,
    108         InCellMode,
    109         InSelectMode,
    110         InSelectInTableMode,
    111         InForeignContentMode,
    112         AfterBodyMode,
    113         InFramesetMode,
    114         AfterFramesetMode,
    115         AfterAfterBodyMode,
    116         AfterAfterFramesetMode,
    117     };
    118 
    119     HTMLTreeBuilder(HTMLDocumentParser* parser, HTMLDocument*, bool reportErrors, bool usePreHTML5ParserQuirks);
    120     HTMLTreeBuilder(HTMLDocumentParser* parser, DocumentFragment*, Element* contextElement, FragmentScriptingPermission, bool usePreHTML5ParserQuirks);
    121 
    122     void processToken(AtomicHTMLToken&);
    123 
    124     void processDoctypeToken(AtomicHTMLToken&);
    125     void processStartTag(AtomicHTMLToken&);
    126     void processEndTag(AtomicHTMLToken&);
    127     void processComment(AtomicHTMLToken&);
    128     void processCharacter(AtomicHTMLToken&);
    129     void processEndOfFile(AtomicHTMLToken&);
    130 
    131     bool processStartTagForInHead(AtomicHTMLToken&);
    132     void processStartTagForInBody(AtomicHTMLToken&);
    133     void processStartTagForInTable(AtomicHTMLToken&);
    134     void processEndTagForInBody(AtomicHTMLToken&);
    135     void processEndTagForInTable(AtomicHTMLToken&);
    136     void processEndTagForInTableBody(AtomicHTMLToken&);
    137     void processEndTagForInRow(AtomicHTMLToken&);
    138     void processEndTagForInCell(AtomicHTMLToken&);
    139 
    140     void processIsindexStartTagForInBody(AtomicHTMLToken&);
    141     bool processBodyEndTagForInBody(AtomicHTMLToken&);
    142     bool processTableEndTagForInTable();
    143     bool processCaptionEndTagForInCaption();
    144     bool processColgroupEndTagForInColumnGroup();
    145     bool processTrEndTagForInRow();
    146     // FIXME: This function should be inlined into its one call site or it
    147     // needs to assert which tokens it can be called with.
    148     void processAnyOtherEndTagForInBody(AtomicHTMLToken&);
    149 
    150     void processCharacterBuffer(ExternalCharacterTokenBuffer&);
    151 
    152     void processFakeStartTag(const QualifiedName&, PassRefPtr<NamedNodeMap> attributes = 0);
    153     void processFakeEndTag(const QualifiedName&);
    154     void processFakeCharacters(const String&);
    155     void processFakePEndTagIfPInButtonScope();
    156 
    157     void processGenericRCDATAStartTag(AtomicHTMLToken&);
    158     void processGenericRawTextStartTag(AtomicHTMLToken&);
    159     void processScriptStartTag(AtomicHTMLToken&);
    160 
    161     // Default processing for the different insertion modes.
    162     void defaultForInitial();
    163     void defaultForBeforeHTML();
    164     void defaultForBeforeHead();
    165     void defaultForInHead();
    166     void defaultForInHeadNoscript();
    167     void defaultForAfterHead();
    168     void defaultForInTableText();
    169 
    170     void prepareToReprocessToken();
    171 
    172     void reprocessStartTag(AtomicHTMLToken&);
    173     void reprocessEndTag(AtomicHTMLToken&);
    174 
    175     PassRefPtr<NamedNodeMap> attributesForIsindexInput(AtomicHTMLToken&);
    176 
    177     HTMLElementStack::ElementRecord* furthestBlockForFormattingElement(Element*);
    178     void callTheAdoptionAgency(AtomicHTMLToken&);
    179 
    180     void closeTheCell();
    181 
    182     template <bool shouldClose(const ContainerNode*)>
    183     void processCloseWhenNestedTag(AtomicHTMLToken&);
    184 
    185     bool m_framesetOk;
    186 
    187     void parseError(AtomicHTMLToken&);
    188 
    189     InsertionMode insertionMode() const { return m_insertionMode; }
    190     void setInsertionMode(InsertionMode mode)
    191     {
    192         m_insertionMode = mode;
    193         m_isFakeInsertionMode = false;
    194     }
    195 
    196     bool isFakeInsertionMode() { return m_isFakeInsertionMode; }
    197     void setFakeInsertionMode(InsertionMode mode)
    198     {
    199         m_insertionMode = mode;
    200         m_isFakeInsertionMode = true;
    201     }
    202 
    203     void resetInsertionModeAppropriately();
    204 
    205     void processForeignContentUsingInBodyModeAndResetMode(AtomicHTMLToken& token);
    206     void resetForeignInsertionMode();
    207 
    208     class FragmentParsingContext {
    209         WTF_MAKE_NONCOPYABLE(FragmentParsingContext);
    210     public:
    211         FragmentParsingContext();
    212         FragmentParsingContext(DocumentFragment*, Element* contextElement, FragmentScriptingPermission);
    213         ~FragmentParsingContext();
    214 
    215         DocumentFragment* fragment() const { return m_fragment; }
    216         Element* contextElement() const { ASSERT(m_fragment); return m_contextElement; }
    217         FragmentScriptingPermission scriptingPermission() const { ASSERT(m_fragment); return m_scriptingPermission; }
    218 
    219     private:
    220         DocumentFragment* m_fragment;
    221         Element* m_contextElement;
    222 
    223         // FragmentScriptingNotAllowed causes the Parser to remove children
    224         // from <script> tags (so javascript doesn't show up in pastes).
    225         FragmentScriptingPermission m_scriptingPermission;
    226     };
    227 
    228     FragmentParsingContext m_fragmentContext;
    229 
    230     Document* m_document;
    231     HTMLConstructionSite m_tree;
    232 
    233     bool m_reportErrors;
    234     bool m_isPaused;
    235     bool m_isFakeInsertionMode;
    236 
    237     // FIXME: InsertionModes should be a separate object to prevent direct
    238     // manipulation of these variables.  For now, be careful to always use
    239     // setInsertionMode and never set m_insertionMode directly.
    240     InsertionMode m_insertionMode;
    241     InsertionMode m_originalInsertionMode;
    242 
    243     // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#pending-table-character-tokens
    244     Vector<UChar> m_pendingTableCharacters;
    245 
    246     // We access parser because HTML5 spec requires that we be able to change the state of the tokenizer
    247     // from within parser actions. We also need it to track the current position.
    248     HTMLDocumentParser* m_parser;
    249 
    250     RefPtr<Element> m_scriptToProcess; // <script> tag which needs processing before resuming the parser.
    251     TextPosition1 m_scriptToProcessStartPosition; // Starting line number of the script tag needing processing.
    252 
    253     // FIXME: We probably want to remove this member.  Originally, it was
    254     // created to service the legacy tree builder, but it seems to be used for
    255     // some other things now.
    256     TextPosition0 m_lastScriptElementStartPosition;
    257 
    258     bool m_usePreHTML5ParserQuirks;
    259 
    260     bool m_hasPendingForeignInsertionModeSteps;
    261 };
    262 
    263 }
    264 
    265 #endif
    266