Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef HTMLToken_h
     27 #define HTMLToken_h
     28 
     29 #include "core/dom/Attribute.h"
     30 #include "wtf/PassOwnPtr.h"
     31 #include "wtf/RefCounted.h"
     32 #include "wtf/RefPtr.h"
     33 
     34 namespace WebCore {
     35 
     36 class DoctypeData {
     37     WTF_MAKE_NONCOPYABLE(DoctypeData);
     38 public:
     39     DoctypeData()
     40         : m_hasPublicIdentifier(false)
     41         , m_hasSystemIdentifier(false)
     42         , m_forceQuirks(false)
     43     {
     44     }
     45 
     46     // FIXME: This should use String instead of Vector<UChar>.
     47     bool m_hasPublicIdentifier;
     48     bool m_hasSystemIdentifier;
     49     WTF::Vector<UChar> m_publicIdentifier;
     50     WTF::Vector<UChar> m_systemIdentifier;
     51     bool m_forceQuirks;
     52 };
     53 
     54 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
     55 {
     56     for (unsigned i = 0; i < attributes.size(); ++i) {
     57         if (attributes.at(i).name().matches(name))
     58             return &attributes.at(i);
     59     }
     60     return 0;
     61 }
     62 
     63 class HTMLToken {
     64     WTF_MAKE_NONCOPYABLE(HTMLToken);
     65     WTF_MAKE_FAST_ALLOCATED;
     66 public:
     67     enum Type {
     68         Uninitialized,
     69         DOCTYPE,
     70         StartTag,
     71         EndTag,
     72         Comment,
     73         Character,
     74         EndOfFile,
     75     };
     76 
     77     class Attribute {
     78     public:
     79         class Range {
     80         public:
     81             int start;
     82             int end;
     83         };
     84 
     85         Range nameRange;
     86         Range valueRange;
     87         Vector<UChar, 32> name;
     88         Vector<UChar, 32> value;
     89     };
     90 
     91     typedef Vector<Attribute, 10> AttributeList;
     92 
     93     // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
     94     // approximately 99% of the time based on a non-scientific browse around a number of
     95     // popular web sites on 23 May 2013.
     96     typedef Vector<UChar, 256> DataVector;
     97 
     98     HTMLToken() { clear(); }
     99 
    100     void clear()
    101     {
    102         m_type = Uninitialized;
    103         m_range.start = 0;
    104         m_range.end = 0;
    105         m_baseOffset = 0;
    106         // Don't call Vector::clear() as that would destroy the
    107         // alloced VectorBuffer. If the innerHTML'd content has
    108         // two 257 character text nodes in a row, we'll needlessly
    109         // thrash malloc. When we finally finish the parse the
    110         // HTMLToken will be destroyed and the VectorBuffer released.
    111         m_data.shrink(0);
    112         m_orAllData = 0;
    113     }
    114 
    115     bool isUninitialized() { return m_type == Uninitialized; }
    116     Type type() const { return m_type; }
    117 
    118     void makeEndOfFile()
    119     {
    120         ASSERT(m_type == Uninitialized);
    121         m_type = EndOfFile;
    122     }
    123 
    124     /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
    125     int startIndex() const { return m_range.start; }
    126     int endIndex() const { return m_range.end; }
    127 
    128     void setBaseOffset(int offset)
    129     {
    130         m_baseOffset = offset;
    131     }
    132 
    133     void end(int endOffset)
    134     {
    135         m_range.end = endOffset - m_baseOffset;
    136     }
    137 
    138     const DataVector& data() const
    139     {
    140         ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
    141         return m_data;
    142     }
    143 
    144     bool isAll8BitData() const
    145     {
    146         return (m_orAllData <= 0xff);
    147     }
    148 
    149     const DataVector& name() const
    150     {
    151         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    152         return m_data;
    153     }
    154 
    155     void appendToName(UChar character)
    156     {
    157         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    158         ASSERT(character);
    159         m_data.append(character);
    160         m_orAllData |= character;
    161     }
    162 
    163     /* DOCTYPE Tokens */
    164 
    165     bool forceQuirks() const
    166     {
    167         ASSERT(m_type == DOCTYPE);
    168         return m_doctypeData->m_forceQuirks;
    169     }
    170 
    171     void setForceQuirks()
    172     {
    173         ASSERT(m_type == DOCTYPE);
    174         m_doctypeData->m_forceQuirks = true;
    175     }
    176 
    177     void beginDOCTYPE()
    178     {
    179         ASSERT(m_type == Uninitialized);
    180         m_type = DOCTYPE;
    181         m_doctypeData = adoptPtr(new DoctypeData);
    182     }
    183 
    184     void beginDOCTYPE(UChar character)
    185     {
    186         ASSERT(character);
    187         beginDOCTYPE();
    188         m_data.append(character);
    189         m_orAllData |= character;
    190     }
    191 
    192     // FIXME: Distinguish between a missing public identifer and an empty one.
    193     const WTF::Vector<UChar>& publicIdentifier() const
    194     {
    195         ASSERT(m_type == DOCTYPE);
    196         return m_doctypeData->m_publicIdentifier;
    197     }
    198 
    199     // FIXME: Distinguish between a missing system identifer and an empty one.
    200     const WTF::Vector<UChar>& systemIdentifier() const
    201     {
    202         ASSERT(m_type == DOCTYPE);
    203         return m_doctypeData->m_systemIdentifier;
    204     }
    205 
    206     void setPublicIdentifierToEmptyString()
    207     {
    208         ASSERT(m_type == DOCTYPE);
    209         m_doctypeData->m_hasPublicIdentifier = true;
    210         m_doctypeData->m_publicIdentifier.clear();
    211     }
    212 
    213     void setSystemIdentifierToEmptyString()
    214     {
    215         ASSERT(m_type == DOCTYPE);
    216         m_doctypeData->m_hasSystemIdentifier = true;
    217         m_doctypeData->m_systemIdentifier.clear();
    218     }
    219 
    220     void appendToPublicIdentifier(UChar character)
    221     {
    222         ASSERT(character);
    223         ASSERT(m_type == DOCTYPE);
    224         ASSERT(m_doctypeData->m_hasPublicIdentifier);
    225         m_doctypeData->m_publicIdentifier.append(character);
    226     }
    227 
    228     void appendToSystemIdentifier(UChar character)
    229     {
    230         ASSERT(character);
    231         ASSERT(m_type == DOCTYPE);
    232         ASSERT(m_doctypeData->m_hasSystemIdentifier);
    233         m_doctypeData->m_systemIdentifier.append(character);
    234     }
    235 
    236     PassOwnPtr<DoctypeData> releaseDoctypeData()
    237     {
    238         return m_doctypeData.release();
    239     }
    240 
    241     /* Start/End Tag Tokens */
    242 
    243     bool selfClosing() const
    244     {
    245         ASSERT(m_type == StartTag || m_type == EndTag);
    246         return m_selfClosing;
    247     }
    248 
    249     void setSelfClosing()
    250     {
    251         ASSERT(m_type == StartTag || m_type == EndTag);
    252         m_selfClosing = true;
    253     }
    254 
    255     void beginStartTag(UChar character)
    256     {
    257         ASSERT(character);
    258         ASSERT(m_type == Uninitialized);
    259         m_type = StartTag;
    260         m_selfClosing = false;
    261         m_currentAttribute = 0;
    262         m_attributes.clear();
    263 
    264         m_data.append(character);
    265         m_orAllData |= character;
    266     }
    267 
    268     void beginEndTag(LChar character)
    269     {
    270         ASSERT(m_type == Uninitialized);
    271         m_type = EndTag;
    272         m_selfClosing = false;
    273         m_currentAttribute = 0;
    274         m_attributes.clear();
    275 
    276         m_data.append(character);
    277     }
    278 
    279     void beginEndTag(const Vector<LChar, 32>& characters)
    280     {
    281         ASSERT(m_type == Uninitialized);
    282         m_type = EndTag;
    283         m_selfClosing = false;
    284         m_currentAttribute = 0;
    285         m_attributes.clear();
    286 
    287         m_data.appendVector(characters);
    288     }
    289 
    290     void addNewAttribute()
    291     {
    292         ASSERT(m_type == StartTag || m_type == EndTag);
    293         m_attributes.grow(m_attributes.size() + 1);
    294         m_currentAttribute = &m_attributes.last();
    295 #ifndef NDEBUG
    296         m_currentAttribute->nameRange.start = 0;
    297         m_currentAttribute->nameRange.end = 0;
    298         m_currentAttribute->valueRange.start = 0;
    299         m_currentAttribute->valueRange.end = 0;
    300 #endif
    301     }
    302 
    303     void beginAttributeName(int offset)
    304     {
    305         m_currentAttribute->nameRange.start = offset - m_baseOffset;
    306     }
    307 
    308     void endAttributeName(int offset)
    309     {
    310         int index = offset - m_baseOffset;
    311         m_currentAttribute->nameRange.end = index;
    312         m_currentAttribute->valueRange.start = index;
    313         m_currentAttribute->valueRange.end = index;
    314     }
    315 
    316     void beginAttributeValue(int offset)
    317     {
    318         m_currentAttribute->valueRange.start = offset - m_baseOffset;
    319 #ifndef NDEBUG
    320         m_currentAttribute->valueRange.end = 0;
    321 #endif
    322     }
    323 
    324     void endAttributeValue(int offset)
    325     {
    326         m_currentAttribute->valueRange.end = offset - m_baseOffset;
    327     }
    328 
    329     void appendToAttributeName(UChar character)
    330     {
    331         ASSERT(character);
    332         ASSERT(m_type == StartTag || m_type == EndTag);
    333         ASSERT(m_currentAttribute->nameRange.start);
    334         m_currentAttribute->name.append(character);
    335     }
    336 
    337     void appendToAttributeValue(UChar character)
    338     {
    339         ASSERT(character);
    340         ASSERT(m_type == StartTag || m_type == EndTag);
    341         ASSERT(m_currentAttribute->valueRange.start);
    342         m_currentAttribute->value.append(character);
    343     }
    344 
    345     void appendToAttributeValue(size_t i, const String& value)
    346     {
    347         ASSERT(!value.isEmpty());
    348         ASSERT(m_type == StartTag || m_type == EndTag);
    349         append(m_attributes[i].value, value);
    350     }
    351 
    352     const AttributeList& attributes() const
    353     {
    354         ASSERT(m_type == StartTag || m_type == EndTag);
    355         return m_attributes;
    356     }
    357 
    358     const Attribute* getAttributeItem(const QualifiedName& name) const
    359     {
    360         for (unsigned i = 0; i < m_attributes.size(); ++i) {
    361             if (AtomicString(m_attributes.at(i).name) == name.localName())
    362                 return &m_attributes.at(i);
    363         }
    364         return 0;
    365     }
    366 
    367     // Used by the XSSAuditor to nuke XSS-laden attributes.
    368     void eraseValueOfAttribute(size_t i)
    369     {
    370         ASSERT(m_type == StartTag || m_type == EndTag);
    371         m_attributes[i].value.clear();
    372     }
    373 
    374     /* Character Tokens */
    375 
    376     // Starting a character token works slightly differently than starting
    377     // other types of tokens because we want to save a per-character branch.
    378     void ensureIsCharacterToken()
    379     {
    380         ASSERT(m_type == Uninitialized || m_type == Character);
    381         m_type = Character;
    382     }
    383 
    384     const DataVector& characters() const
    385     {
    386         ASSERT(m_type == Character);
    387         return m_data;
    388     }
    389 
    390     void appendToCharacter(char character)
    391     {
    392         ASSERT(m_type == Character);
    393         m_data.append(character);
    394     }
    395 
    396     void appendToCharacter(UChar character)
    397     {
    398         ASSERT(m_type == Character);
    399         m_data.append(character);
    400         m_orAllData |= character;
    401     }
    402 
    403     void appendToCharacter(const Vector<LChar, 32>& characters)
    404     {
    405         ASSERT(m_type == Character);
    406         m_data.appendVector(characters);
    407     }
    408 
    409     /* Comment Tokens */
    410 
    411     const DataVector& comment() const
    412     {
    413         ASSERT(m_type == Comment);
    414         return m_data;
    415     }
    416 
    417     void beginComment()
    418     {
    419         ASSERT(m_type == Uninitialized);
    420         m_type = Comment;
    421     }
    422 
    423     void appendToComment(UChar character)
    424     {
    425         ASSERT(character);
    426         ASSERT(m_type == Comment);
    427         m_data.append(character);
    428         m_orAllData |= character;
    429     }
    430 
    431     // Only for XSSAuditor
    432     void eraseCharacters()
    433     {
    434         ASSERT(m_type == Character);
    435         m_data.clear();
    436         m_orAllData = 0;
    437     }
    438 
    439 private:
    440     Type m_type;
    441     Attribute::Range m_range; // Always starts at zero.
    442     int m_baseOffset;
    443     DataVector m_data;
    444     UChar m_orAllData;
    445 
    446     // For StartTag and EndTag
    447     bool m_selfClosing;
    448     AttributeList m_attributes;
    449 
    450     // A pointer into m_attributes used during lexing.
    451     Attribute* m_currentAttribute;
    452 
    453     // For DOCTYPE
    454     OwnPtr<DoctypeData> m_doctypeData;
    455 };
    456 
    457 }
    458 
    459 #endif
    460