Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef HTMLToken_h
     27 #define HTMLToken_h
     28 
     29 #include "core/dom/Attribute.h"
     30 #include "wtf/PassOwnPtr.h"
     31 #include "wtf/RefCounted.h"
     32 #include "wtf/RefPtr.h"
     33 
     34 namespace blink {
     35 
     36 class DoctypeData {
     37     WTF_MAKE_NONCOPYABLE(DoctypeData);
     38 public:
     39     DoctypeData()
     40         : m_hasPublicIdentifier(false)
     41         , m_hasSystemIdentifier(false)
     42         , m_forceQuirks(false)
     43     {
     44     }
     45 
     46     bool m_hasPublicIdentifier;
     47     bool m_hasSystemIdentifier;
     48     WTF::Vector<UChar> m_publicIdentifier;
     49     WTF::Vector<UChar> m_systemIdentifier;
     50     bool m_forceQuirks;
     51 };
     52 
     53 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
     54 {
     55     for (unsigned i = 0; i < attributes.size(); ++i) {
     56         if (attributes.at(i).name().matches(name))
     57             return &attributes.at(i);
     58     }
     59     return 0;
     60 }
     61 
     62 class HTMLToken {
     63     WTF_MAKE_NONCOPYABLE(HTMLToken);
     64     WTF_MAKE_FAST_ALLOCATED;
     65 public:
     66     enum Type {
     67         Uninitialized,
     68         DOCTYPE,
     69         StartTag,
     70         EndTag,
     71         Comment,
     72         Character,
     73         EndOfFile,
     74     };
     75 
     76     class Attribute {
     77     public:
     78         class Range {
     79         public:
     80             int start;
     81             int end;
     82         };
     83 
     84         Range nameRange;
     85         Range valueRange;
     86         Vector<UChar, 32> name;
     87         Vector<UChar, 32> value;
     88     };
     89 
     90     typedef Vector<Attribute, 10> AttributeList;
     91 
     92     // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
     93     // approximately 99% of the time based on a non-scientific browse around a number of
     94     // popular web sites on 23 May 2013.
     95     typedef Vector<UChar, 256> DataVector;
     96 
     97     HTMLToken() { clear(); }
     98 
     99     void clear()
    100     {
    101         m_type = Uninitialized;
    102         m_range.start = 0;
    103         m_range.end = 0;
    104         m_baseOffset = 0;
    105         // Don't call Vector::clear() as that would destroy the
    106         // alloced VectorBuffer. If the innerHTML'd content has
    107         // two 257 character text nodes in a row, we'll needlessly
    108         // thrash malloc. When we finally finish the parse the
    109         // HTMLToken will be destroyed and the VectorBuffer released.
    110         m_data.shrink(0);
    111         m_orAllData = 0;
    112     }
    113 
    114     bool isUninitialized() { return m_type == Uninitialized; }
    115     Type type() const { return m_type; }
    116 
    117     void makeEndOfFile()
    118     {
    119         ASSERT(m_type == Uninitialized);
    120         m_type = EndOfFile;
    121     }
    122 
    123     /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
    124     int startIndex() const { return m_range.start; }
    125     int endIndex() const { return m_range.end; }
    126 
    127     void setBaseOffset(int offset)
    128     {
    129         m_baseOffset = offset;
    130     }
    131 
    132     void end(int endOffset)
    133     {
    134         m_range.end = endOffset - m_baseOffset;
    135     }
    136 
    137     const DataVector& data() const
    138     {
    139         ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
    140         return m_data;
    141     }
    142 
    143     bool isAll8BitData() const
    144     {
    145         return (m_orAllData <= 0xff);
    146     }
    147 
    148     const DataVector& name() const
    149     {
    150         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    151         return m_data;
    152     }
    153 
    154     void appendToName(UChar character)
    155     {
    156         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    157         ASSERT(character);
    158         m_data.append(character);
    159         m_orAllData |= character;
    160     }
    161 
    162     /* DOCTYPE Tokens */
    163 
    164     bool forceQuirks() const
    165     {
    166         ASSERT(m_type == DOCTYPE);
    167         return m_doctypeData->m_forceQuirks;
    168     }
    169 
    170     void setForceQuirks()
    171     {
    172         ASSERT(m_type == DOCTYPE);
    173         m_doctypeData->m_forceQuirks = true;
    174     }
    175 
    176     void beginDOCTYPE()
    177     {
    178         ASSERT(m_type == Uninitialized);
    179         m_type = DOCTYPE;
    180         m_doctypeData = adoptPtr(new DoctypeData);
    181     }
    182 
    183     void beginDOCTYPE(UChar character)
    184     {
    185         ASSERT(character);
    186         beginDOCTYPE();
    187         m_data.append(character);
    188         m_orAllData |= character;
    189     }
    190 
    191     // FIXME: Distinguish between a missing public identifer and an empty one.
    192     const WTF::Vector<UChar>& publicIdentifier() const
    193     {
    194         ASSERT(m_type == DOCTYPE);
    195         return m_doctypeData->m_publicIdentifier;
    196     }
    197 
    198     // FIXME: Distinguish between a missing system identifer and an empty one.
    199     const WTF::Vector<UChar>& systemIdentifier() const
    200     {
    201         ASSERT(m_type == DOCTYPE);
    202         return m_doctypeData->m_systemIdentifier;
    203     }
    204 
    205     void setPublicIdentifierToEmptyString()
    206     {
    207         ASSERT(m_type == DOCTYPE);
    208         m_doctypeData->m_hasPublicIdentifier = true;
    209         m_doctypeData->m_publicIdentifier.clear();
    210     }
    211 
    212     void setSystemIdentifierToEmptyString()
    213     {
    214         ASSERT(m_type == DOCTYPE);
    215         m_doctypeData->m_hasSystemIdentifier = true;
    216         m_doctypeData->m_systemIdentifier.clear();
    217     }
    218 
    219     void appendToPublicIdentifier(UChar character)
    220     {
    221         ASSERT(character);
    222         ASSERT(m_type == DOCTYPE);
    223         ASSERT(m_doctypeData->m_hasPublicIdentifier);
    224         m_doctypeData->m_publicIdentifier.append(character);
    225     }
    226 
    227     void appendToSystemIdentifier(UChar character)
    228     {
    229         ASSERT(character);
    230         ASSERT(m_type == DOCTYPE);
    231         ASSERT(m_doctypeData->m_hasSystemIdentifier);
    232         m_doctypeData->m_systemIdentifier.append(character);
    233     }
    234 
    235     PassOwnPtr<DoctypeData> releaseDoctypeData()
    236     {
    237         return m_doctypeData.release();
    238     }
    239 
    240     /* Start/End Tag Tokens */
    241 
    242     bool selfClosing() const
    243     {
    244         ASSERT(m_type == StartTag || m_type == EndTag);
    245         return m_selfClosing;
    246     }
    247 
    248     void setSelfClosing()
    249     {
    250         ASSERT(m_type == StartTag || m_type == EndTag);
    251         m_selfClosing = true;
    252     }
    253 
    254     void beginStartTag(UChar character)
    255     {
    256         ASSERT(character);
    257         ASSERT(m_type == Uninitialized);
    258         m_type = StartTag;
    259         m_selfClosing = false;
    260         m_currentAttribute = 0;
    261         m_attributes.clear();
    262 
    263         m_data.append(character);
    264         m_orAllData |= character;
    265     }
    266 
    267     void beginEndTag(LChar character)
    268     {
    269         ASSERT(m_type == Uninitialized);
    270         m_type = EndTag;
    271         m_selfClosing = false;
    272         m_currentAttribute = 0;
    273         m_attributes.clear();
    274 
    275         m_data.append(character);
    276     }
    277 
    278     void beginEndTag(const Vector<LChar, 32>& characters)
    279     {
    280         ASSERT(m_type == Uninitialized);
    281         m_type = EndTag;
    282         m_selfClosing = false;
    283         m_currentAttribute = 0;
    284         m_attributes.clear();
    285 
    286         m_data.appendVector(characters);
    287     }
    288 
    289     void addNewAttribute()
    290     {
    291         ASSERT(m_type == StartTag || m_type == EndTag);
    292         m_attributes.grow(m_attributes.size() + 1);
    293         m_currentAttribute = &m_attributes.last();
    294 #if ENABLE(ASSERT)
    295         m_currentAttribute->nameRange.start = 0;
    296         m_currentAttribute->nameRange.end = 0;
    297         m_currentAttribute->valueRange.start = 0;
    298         m_currentAttribute->valueRange.end = 0;
    299 #endif
    300     }
    301 
    302     void beginAttributeName(int offset)
    303     {
    304         m_currentAttribute->nameRange.start = offset - m_baseOffset;
    305     }
    306 
    307     void endAttributeName(int offset)
    308     {
    309         int index = offset - m_baseOffset;
    310         m_currentAttribute->nameRange.end = index;
    311         m_currentAttribute->valueRange.start = index;
    312         m_currentAttribute->valueRange.end = index;
    313     }
    314 
    315     void beginAttributeValue(int offset)
    316     {
    317         m_currentAttribute->valueRange.start = offset - m_baseOffset;
    318 #if ENABLE(ASSERT)
    319         m_currentAttribute->valueRange.end = 0;
    320 #endif
    321     }
    322 
    323     void endAttributeValue(int offset)
    324     {
    325         m_currentAttribute->valueRange.end = offset - m_baseOffset;
    326     }
    327 
    328     void appendToAttributeName(UChar character)
    329     {
    330         ASSERT(character);
    331         ASSERT(m_type == StartTag || m_type == EndTag);
    332         ASSERT(m_currentAttribute->nameRange.start);
    333         m_currentAttribute->name.append(character);
    334     }
    335 
    336     void appendToAttributeValue(UChar character)
    337     {
    338         ASSERT(character);
    339         ASSERT(m_type == StartTag || m_type == EndTag);
    340         ASSERT(m_currentAttribute->valueRange.start);
    341         m_currentAttribute->value.append(character);
    342     }
    343 
    344     void appendToAttributeValue(size_t i, const String& value)
    345     {
    346         ASSERT(!value.isEmpty());
    347         ASSERT(m_type == StartTag || m_type == EndTag);
    348         append(m_attributes[i].value, value);
    349     }
    350 
    351     const AttributeList& attributes() const
    352     {
    353         ASSERT(m_type == StartTag || m_type == EndTag);
    354         return m_attributes;
    355     }
    356 
    357     const Attribute* getAttributeItem(const QualifiedName& name) const
    358     {
    359         for (unsigned i = 0; i < m_attributes.size(); ++i) {
    360             if (AtomicString(m_attributes.at(i).name) == name.localName())
    361                 return &m_attributes.at(i);
    362         }
    363         return 0;
    364     }
    365 
    366     // Used by the XSSAuditor to nuke XSS-laden attributes.
    367     void eraseValueOfAttribute(size_t i)
    368     {
    369         ASSERT(m_type == StartTag || m_type == EndTag);
    370         m_attributes[i].value.clear();
    371     }
    372 
    373     /* Character Tokens */
    374 
    375     // Starting a character token works slightly differently than starting
    376     // other types of tokens because we want to save a per-character branch.
    377     void ensureIsCharacterToken()
    378     {
    379         ASSERT(m_type == Uninitialized || m_type == Character);
    380         m_type = Character;
    381     }
    382 
    383     const DataVector& characters() const
    384     {
    385         ASSERT(m_type == Character);
    386         return m_data;
    387     }
    388 
    389     void appendToCharacter(char character)
    390     {
    391         ASSERT(m_type == Character);
    392         m_data.append(character);
    393     }
    394 
    395     void appendToCharacter(UChar character)
    396     {
    397         ASSERT(m_type == Character);
    398         m_data.append(character);
    399         m_orAllData |= character;
    400     }
    401 
    402     void appendToCharacter(const Vector<LChar, 32>& characters)
    403     {
    404         ASSERT(m_type == Character);
    405         m_data.appendVector(characters);
    406     }
    407 
    408     /* Comment Tokens */
    409 
    410     const DataVector& comment() const
    411     {
    412         ASSERT(m_type == Comment);
    413         return m_data;
    414     }
    415 
    416     void beginComment()
    417     {
    418         ASSERT(m_type == Uninitialized);
    419         m_type = Comment;
    420     }
    421 
    422     void appendToComment(UChar character)
    423     {
    424         ASSERT(character);
    425         ASSERT(m_type == Comment);
    426         m_data.append(character);
    427         m_orAllData |= character;
    428     }
    429 
    430     // Only for XSSAuditor
    431     void eraseCharacters()
    432     {
    433         ASSERT(m_type == Character);
    434         m_data.clear();
    435         m_orAllData = 0;
    436     }
    437 
    438 private:
    439     Type m_type;
    440     Attribute::Range m_range; // Always starts at zero.
    441     int m_baseOffset;
    442     DataVector m_data;
    443     UChar m_orAllData;
    444 
    445     // For StartTag and EndTag
    446     bool m_selfClosing;
    447     AttributeList m_attributes;
    448 
    449     // A pointer into m_attributes used during lexing.
    450     Attribute* m_currentAttribute;
    451 
    452     // For DOCTYPE
    453     OwnPtr<DoctypeData> m_doctypeData;
    454 };
    455 
    456 }
    457 
    458 #endif
    459