Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2013 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef HTMLToken_h
     27 #define HTMLToken_h
     28 
     29 #include "core/dom/Attribute.h"
     30 #include "core/html/parser/HTMLToken.h"
     31 #include "wtf/PassOwnPtr.h"
     32 #include "wtf/RefCounted.h"
     33 #include "wtf/RefPtr.h"
     34 
     35 namespace WebCore {
     36 
     37 class DoctypeData {
     38     WTF_MAKE_NONCOPYABLE(DoctypeData);
     39 public:
     40     DoctypeData()
     41         : m_hasPublicIdentifier(false)
     42         , m_hasSystemIdentifier(false)
     43         , m_forceQuirks(false)
     44     {
     45     }
     46 
     47     // FIXME: This should use String instead of Vector<UChar>.
     48     bool m_hasPublicIdentifier;
     49     bool m_hasSystemIdentifier;
     50     WTF::Vector<UChar> m_publicIdentifier;
     51     WTF::Vector<UChar> m_systemIdentifier;
     52     bool m_forceQuirks;
     53 };
     54 
     55 static inline Attribute* findAttributeInVector(Vector<Attribute>& attributes, const QualifiedName& name)
     56 {
     57     for (unsigned i = 0; i < attributes.size(); ++i) {
     58         if (attributes.at(i).name().matches(name))
     59             return &attributes.at(i);
     60     }
     61     return 0;
     62 }
     63 
     64 class HTMLToken {
     65     WTF_MAKE_NONCOPYABLE(HTMLToken);
     66     WTF_MAKE_FAST_ALLOCATED;
     67 public:
     68     enum Type {
     69         Uninitialized,
     70         DOCTYPE,
     71         StartTag,
     72         EndTag,
     73         Comment,
     74         Character,
     75         EndOfFile,
     76     };
     77 
     78     class Attribute {
     79     public:
     80         class Range {
     81         public:
     82             int start;
     83             int end;
     84         };
     85 
     86         Range nameRange;
     87         Range valueRange;
     88         Vector<UChar, 32> name;
     89         Vector<UChar, 32> value;
     90     };
     91 
     92     typedef Vector<Attribute, 10> AttributeList;
     93 
     94     // By using an inline capacity of 256, we avoid spilling over into an malloced buffer
     95     // approximately 99% of the time based on a non-scientific browse around a number of
     96     // popular web sites on 23 May 2013.
     97     typedef Vector<UChar, 256> DataVector;
     98 
     99     HTMLToken() { clear(); }
    100 
    101     void clear()
    102     {
    103         m_type = Uninitialized;
    104         m_range.start = 0;
    105         m_range.end = 0;
    106         m_baseOffset = 0;
    107         m_data.clear();
    108         m_orAllData = 0;
    109     }
    110 
    111     bool isUninitialized() { return m_type == Uninitialized; }
    112     Type type() const { return m_type; }
    113 
    114     void makeEndOfFile()
    115     {
    116         ASSERT(m_type == Uninitialized);
    117         m_type = EndOfFile;
    118     }
    119 
    120     /* Range and offset methods exposed for HTMLSourceTracker and HTMLViewSourceParser */
    121     int startIndex() const { return m_range.start; }
    122     int endIndex() const { return m_range.end; }
    123 
    124     void setBaseOffset(int offset)
    125     {
    126         m_baseOffset = offset;
    127     }
    128 
    129     void end(int endOffset)
    130     {
    131         m_range.end = endOffset - m_baseOffset;
    132     }
    133 
    134     const DataVector& data() const
    135     {
    136         ASSERT(m_type == Character || m_type == Comment || m_type == StartTag || m_type == EndTag);
    137         return m_data;
    138     }
    139 
    140     bool isAll8BitData() const
    141     {
    142         return (m_orAllData <= 0xff);
    143     }
    144 
    145     const DataVector& name() const
    146     {
    147         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    148         return m_data;
    149     }
    150 
    151     void appendToName(UChar character)
    152     {
    153         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    154         ASSERT(character);
    155         m_data.append(character);
    156         m_orAllData |= character;
    157     }
    158 
    159     /* DOCTYPE Tokens */
    160 
    161     bool forceQuirks() const
    162     {
    163         ASSERT(m_type == DOCTYPE);
    164         return m_doctypeData->m_forceQuirks;
    165     }
    166 
    167     void setForceQuirks()
    168     {
    169         ASSERT(m_type == DOCTYPE);
    170         m_doctypeData->m_forceQuirks = true;
    171     }
    172 
    173     void beginDOCTYPE()
    174     {
    175         ASSERT(m_type == Uninitialized);
    176         m_type = DOCTYPE;
    177         m_doctypeData = adoptPtr(new DoctypeData);
    178     }
    179 
    180     void beginDOCTYPE(UChar character)
    181     {
    182         ASSERT(character);
    183         beginDOCTYPE();
    184         m_data.append(character);
    185         m_orAllData |= character;
    186     }
    187 
    188     // FIXME: Distinguish between a missing public identifer and an empty one.
    189     const WTF::Vector<UChar>& publicIdentifier() const
    190     {
    191         ASSERT(m_type == DOCTYPE);
    192         return m_doctypeData->m_publicIdentifier;
    193     }
    194 
    195     // FIXME: Distinguish between a missing system identifer and an empty one.
    196     const WTF::Vector<UChar>& systemIdentifier() const
    197     {
    198         ASSERT(m_type == DOCTYPE);
    199         return m_doctypeData->m_systemIdentifier;
    200     }
    201 
    202     void setPublicIdentifierToEmptyString()
    203     {
    204         ASSERT(m_type == DOCTYPE);
    205         m_doctypeData->m_hasPublicIdentifier = true;
    206         m_doctypeData->m_publicIdentifier.clear();
    207     }
    208 
    209     void setSystemIdentifierToEmptyString()
    210     {
    211         ASSERT(m_type == DOCTYPE);
    212         m_doctypeData->m_hasSystemIdentifier = true;
    213         m_doctypeData->m_systemIdentifier.clear();
    214     }
    215 
    216     void appendToPublicIdentifier(UChar character)
    217     {
    218         ASSERT(character);
    219         ASSERT(m_type == DOCTYPE);
    220         ASSERT(m_doctypeData->m_hasPublicIdentifier);
    221         m_doctypeData->m_publicIdentifier.append(character);
    222     }
    223 
    224     void appendToSystemIdentifier(UChar character)
    225     {
    226         ASSERT(character);
    227         ASSERT(m_type == DOCTYPE);
    228         ASSERT(m_doctypeData->m_hasSystemIdentifier);
    229         m_doctypeData->m_systemIdentifier.append(character);
    230     }
    231 
    232     PassOwnPtr<DoctypeData> releaseDoctypeData()
    233     {
    234         return m_doctypeData.release();
    235     }
    236 
    237     /* Start/End Tag Tokens */
    238 
    239     bool selfClosing() const
    240     {
    241         ASSERT(m_type == StartTag || m_type == EndTag);
    242         return m_selfClosing;
    243     }
    244 
    245     void setSelfClosing()
    246     {
    247         ASSERT(m_type == StartTag || m_type == EndTag);
    248         m_selfClosing = true;
    249     }
    250 
    251     void beginStartTag(UChar character)
    252     {
    253         ASSERT(character);
    254         ASSERT(m_type == Uninitialized);
    255         m_type = StartTag;
    256         m_selfClosing = false;
    257         m_currentAttribute = 0;
    258         m_attributes.clear();
    259 
    260         m_data.append(character);
    261         m_orAllData |= character;
    262     }
    263 
    264     void beginEndTag(LChar character)
    265     {
    266         ASSERT(m_type == Uninitialized);
    267         m_type = EndTag;
    268         m_selfClosing = false;
    269         m_currentAttribute = 0;
    270         m_attributes.clear();
    271 
    272         m_data.append(character);
    273     }
    274 
    275     void beginEndTag(const Vector<LChar, 32>& characters)
    276     {
    277         ASSERT(m_type == Uninitialized);
    278         m_type = EndTag;
    279         m_selfClosing = false;
    280         m_currentAttribute = 0;
    281         m_attributes.clear();
    282 
    283         m_data.appendVector(characters);
    284     }
    285 
    286     void addNewAttribute()
    287     {
    288         ASSERT(m_type == StartTag || m_type == EndTag);
    289         m_attributes.grow(m_attributes.size() + 1);
    290         m_currentAttribute = &m_attributes.last();
    291 #ifndef NDEBUG
    292         m_currentAttribute->nameRange.start = 0;
    293         m_currentAttribute->nameRange.end = 0;
    294         m_currentAttribute->valueRange.start = 0;
    295         m_currentAttribute->valueRange.end = 0;
    296 #endif
    297     }
    298 
    299     void beginAttributeName(int offset)
    300     {
    301         m_currentAttribute->nameRange.start = offset - m_baseOffset;
    302     }
    303 
    304     void endAttributeName(int offset)
    305     {
    306         int index = offset - m_baseOffset;
    307         m_currentAttribute->nameRange.end = index;
    308         m_currentAttribute->valueRange.start = index;
    309         m_currentAttribute->valueRange.end = index;
    310     }
    311 
    312     void beginAttributeValue(int offset)
    313     {
    314         m_currentAttribute->valueRange.start = offset - m_baseOffset;
    315 #ifndef NDEBUG
    316         m_currentAttribute->valueRange.end = 0;
    317 #endif
    318     }
    319 
    320     void endAttributeValue(int offset)
    321     {
    322         m_currentAttribute->valueRange.end = offset - m_baseOffset;
    323     }
    324 
    325     void appendToAttributeName(UChar character)
    326     {
    327         ASSERT(character);
    328         ASSERT(m_type == StartTag || m_type == EndTag);
    329         // FIXME: We should be able to add the following ASSERT once we fix
    330         // https://bugs.webkit.org/show_bug.cgi?id=62971
    331         //   ASSERT(m_currentAttribute->nameRange.start);
    332         m_currentAttribute->name.append(character);
    333     }
    334 
    335     void appendToAttributeValue(UChar character)
    336     {
    337         ASSERT(character);
    338         ASSERT(m_type == StartTag || m_type == EndTag);
    339         ASSERT(m_currentAttribute->valueRange.start);
    340         m_currentAttribute->value.append(character);
    341     }
    342 
    343     void appendToAttributeValue(size_t i, const String& value)
    344     {
    345         ASSERT(!value.isEmpty());
    346         ASSERT(m_type == StartTag || m_type == EndTag);
    347         append(m_attributes[i].value, value);
    348     }
    349 
    350     const AttributeList& attributes() const
    351     {
    352         ASSERT(m_type == StartTag || m_type == EndTag);
    353         return m_attributes;
    354     }
    355 
    356     const Attribute* getAttributeItem(const QualifiedName& name) const
    357     {
    358         for (unsigned i = 0; i < m_attributes.size(); ++i) {
    359             if (AtomicString(m_attributes.at(i).name) == name.localName())
    360                 return &m_attributes.at(i);
    361         }
    362         return 0;
    363     }
    364 
    365     // Used by the XSSAuditor to nuke XSS-laden attributes.
    366     void eraseValueOfAttribute(size_t i)
    367     {
    368         ASSERT(m_type == StartTag || m_type == EndTag);
    369         m_attributes[i].value.clear();
    370     }
    371 
    372     /* Character Tokens */
    373 
    374     // Starting a character token works slightly differently than starting
    375     // other types of tokens because we want to save a per-character branch.
    376     void ensureIsCharacterToken()
    377     {
    378         ASSERT(m_type == Uninitialized || m_type == Character);
    379         m_type = Character;
    380     }
    381 
    382     const DataVector& characters() const
    383     {
    384         ASSERT(m_type == Character);
    385         return m_data;
    386     }
    387 
    388     void appendToCharacter(char character)
    389     {
    390         ASSERT(m_type == Character);
    391         m_data.append(character);
    392     }
    393 
    394     void appendToCharacter(UChar character)
    395     {
    396         ASSERT(m_type == Character);
    397         m_data.append(character);
    398         m_orAllData |= character;
    399     }
    400 
    401     void appendToCharacter(const Vector<LChar, 32>& characters)
    402     {
    403         ASSERT(m_type == Character);
    404         m_data.appendVector(characters);
    405     }
    406 
    407     /* Comment Tokens */
    408 
    409     const DataVector& comment() const
    410     {
    411         ASSERT(m_type == Comment);
    412         return m_data;
    413     }
    414 
    415     void beginComment()
    416     {
    417         ASSERT(m_type == Uninitialized);
    418         m_type = Comment;
    419     }
    420 
    421     void appendToComment(UChar character)
    422     {
    423         ASSERT(character);
    424         ASSERT(m_type == Comment);
    425         m_data.append(character);
    426         m_orAllData |= character;
    427     }
    428 
    429     void eraseCharacters()
    430     {
    431         ASSERT(m_type == Character);
    432         m_data.clear();
    433         m_orAllData = 0;
    434     }
    435 
    436 private:
    437     Type m_type;
    438     Attribute::Range m_range; // Always starts at zero.
    439     int m_baseOffset;
    440     DataVector m_data;
    441     UChar m_orAllData;
    442 
    443     // For StartTag and EndTag
    444     bool m_selfClosing;
    445     AttributeList m_attributes;
    446 
    447     // A pointer into m_attributes used during lexing.
    448     Attribute* m_currentAttribute;
    449 
    450     // For DOCTYPE
    451     OwnPtr<DoctypeData> m_doctypeData;
    452 };
    453 
    454 }
    455 
    456 #endif
    457