Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #ifndef HTMLToken_h
     27 #define HTMLToken_h
     28 
     29 #include "NamedNodeMap.h"
     30 #include <wtf/PassOwnPtr.h>
     31 #include <wtf/Vector.h>
     32 
     33 namespace WebCore {
     34 
     35 class HTMLToken {
     36     WTF_MAKE_NONCOPYABLE(HTMLToken); WTF_MAKE_FAST_ALLOCATED;
     37 public:
     38     enum Type {
     39         Uninitialized,
     40         DOCTYPE,
     41         StartTag,
     42         EndTag,
     43         Comment,
     44         Character,
     45         EndOfFile,
     46     };
     47 
     48     class Range {
     49     public:
     50         int m_start;
     51         int m_end;
     52     };
     53 
     54     class Attribute {
     55     public:
     56         Range m_nameRange;
     57         Range m_valueRange;
     58         WTF::Vector<UChar, 32> m_name;
     59         WTF::Vector<UChar, 32> m_value;
     60     };
     61 
     62     typedef WTF::Vector<Attribute, 10> AttributeList;
     63     typedef WTF::Vector<UChar, 1024> DataVector;
     64 
     65     HTMLToken() { clear(); }
     66 
     67     void clear()
     68     {
     69         m_type = Uninitialized;
     70         m_range.m_start = 0;
     71         m_range.m_end = 0;
     72         m_baseOffset = 0;
     73         m_data.clear();
     74     }
     75 
     76     bool isUninitialized() { return m_type == Uninitialized; }
     77 
     78     int startIndex() const { return m_range.m_start; }
     79     int endIndex() const { return m_range.m_end; }
     80 
     81     void setBaseOffset(int offset)
     82     {
     83         m_baseOffset = offset;
     84     }
     85 
     86     void end(int endOffset)
     87     {
     88         m_range.m_end = endOffset - m_baseOffset;
     89     }
     90 
     91     void makeEndOfFile()
     92     {
     93         ASSERT(m_type == Uninitialized);
     94         m_type = EndOfFile;
     95     }
     96 
     97     void beginStartTag(UChar character)
     98     {
     99         ASSERT(character);
    100         ASSERT(m_type == Uninitialized);
    101         m_type = StartTag;
    102         m_selfClosing = false;
    103         m_currentAttribute = 0;
    104         m_attributes.clear();
    105 
    106         m_data.append(character);
    107     }
    108 
    109     template<typename T>
    110     void beginEndTag(T characters)
    111     {
    112         ASSERT(m_type == Uninitialized);
    113         m_type = EndTag;
    114         m_selfClosing = false;
    115         m_currentAttribute = 0;
    116         m_attributes.clear();
    117 
    118         m_data.append(characters);
    119     }
    120 
    121     // Starting a character token works slightly differently than starting
    122     // other types of tokens because we want to save a per-character branch.
    123     void ensureIsCharacterToken()
    124     {
    125         ASSERT(m_type == Uninitialized || m_type == Character);
    126         m_type = Character;
    127     }
    128 
    129     void beginComment()
    130     {
    131         ASSERT(m_type == Uninitialized);
    132         m_type = Comment;
    133     }
    134 
    135     void beginDOCTYPE()
    136     {
    137         ASSERT(m_type == Uninitialized);
    138         m_type = DOCTYPE;
    139         m_doctypeData = adoptPtr(new DoctypeData());
    140     }
    141 
    142     void beginDOCTYPE(UChar character)
    143     {
    144         ASSERT(character);
    145         beginDOCTYPE();
    146         m_data.append(character);
    147     }
    148 
    149     void appendToName(UChar character)
    150     {
    151         ASSERT(character);
    152         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    153         m_data.append(character);
    154     }
    155 
    156     template<typename T>
    157     void appendToCharacter(T characters)
    158     {
    159         ASSERT(m_type == Character);
    160         m_data.append(characters);
    161     }
    162 
    163     void appendToComment(UChar character)
    164     {
    165         ASSERT(character);
    166         ASSERT(m_type == Comment);
    167         m_data.append(character);
    168     }
    169 
    170     void addNewAttribute()
    171     {
    172         ASSERT(m_type == StartTag || m_type == EndTag);
    173         m_attributes.grow(m_attributes.size() + 1);
    174         m_currentAttribute = &m_attributes.last();
    175 #ifndef NDEBUG
    176         m_currentAttribute->m_nameRange.m_start = 0;
    177         m_currentAttribute->m_nameRange.m_end = 0;
    178         m_currentAttribute->m_valueRange.m_start = 0;
    179         m_currentAttribute->m_valueRange.m_end = 0;
    180 #endif
    181     }
    182 
    183     void beginAttributeName(int offset)
    184     {
    185         m_currentAttribute->m_nameRange.m_start = offset - m_baseOffset;
    186     }
    187 
    188     void endAttributeName(int offset)
    189     {
    190         int index = offset - m_baseOffset;
    191         m_currentAttribute->m_nameRange.m_end = index;
    192         m_currentAttribute->m_valueRange.m_start = index;
    193         m_currentAttribute->m_valueRange.m_end = index;
    194     }
    195 
    196     void beginAttributeValue(int offset)
    197     {
    198         m_currentAttribute->m_valueRange.m_start = offset - m_baseOffset;
    199 #ifndef NDEBUG
    200         m_currentAttribute->m_valueRange.m_end = 0;
    201 #endif
    202     }
    203 
    204     void endAttributeValue(int offset)
    205     {
    206         m_currentAttribute->m_valueRange.m_end = offset - m_baseOffset;
    207     }
    208 
    209     void appendToAttributeName(UChar character)
    210     {
    211         ASSERT(character);
    212         ASSERT(m_type == StartTag || m_type == EndTag);
    213         ASSERT(m_currentAttribute->m_nameRange.m_start);
    214         m_currentAttribute->m_name.append(character);
    215     }
    216 
    217     void appendToAttributeValue(UChar character)
    218     {
    219         ASSERT(character);
    220         ASSERT(m_type == StartTag || m_type == EndTag);
    221         ASSERT(m_currentAttribute->m_valueRange.m_start);
    222         m_currentAttribute->m_value.append(character);
    223     }
    224 
    225     void appendToAttributeValue(size_t i, const String& value)
    226     {
    227         ASSERT(!value.isEmpty());
    228         ASSERT(m_type == StartTag || m_type == EndTag);
    229         m_attributes[i].m_value.append(value.characters(), value.length());
    230     }
    231 
    232     Type type() const { return m_type; }
    233 
    234     bool selfClosing() const
    235     {
    236         ASSERT(m_type == StartTag || m_type == EndTag);
    237         return m_selfClosing;
    238     }
    239 
    240     void setSelfClosing()
    241     {
    242         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
    243         m_selfClosing = true;
    244     }
    245 
    246     const AttributeList& attributes() const
    247     {
    248         ASSERT(m_type == StartTag || m_type == EndTag);
    249         return m_attributes;
    250     }
    251 
    252     const DataVector& name() const
    253     {
    254         ASSERT(m_type == StartTag || m_type == EndTag || m_type == DOCTYPE);
    255         return m_data;
    256     }
    257 
    258     void eraseCharacters()
    259     {
    260         ASSERT(m_type == Character);
    261         m_data.clear();
    262     }
    263 
    264     void eraseValueOfAttribute(size_t i)
    265     {
    266         ASSERT(m_type == StartTag || m_type == EndTag);
    267         m_attributes[i].m_value.clear();
    268     }
    269 
    270     const DataVector& characters() const
    271     {
    272         ASSERT(m_type == Character);
    273         return m_data;
    274     }
    275 
    276     const DataVector& comment() const
    277     {
    278         ASSERT(m_type == Comment);
    279         return m_data;
    280     }
    281 
    282     // FIXME: Distinguish between a missing public identifer and an empty one.
    283     const WTF::Vector<UChar>& publicIdentifier() const
    284     {
    285         ASSERT(m_type == DOCTYPE);
    286         return m_doctypeData->m_publicIdentifier;
    287     }
    288 
    289     // FIXME: Distinguish between a missing system identifer and an empty one.
    290     const WTF::Vector<UChar>& systemIdentifier() const
    291     {
    292         ASSERT(m_type == DOCTYPE);
    293         return m_doctypeData->m_systemIdentifier;
    294     }
    295 
    296     void setPublicIdentifierToEmptyString()
    297     {
    298         ASSERT(m_type == DOCTYPE);
    299         m_doctypeData->m_hasPublicIdentifier = true;
    300         m_doctypeData->m_publicIdentifier.clear();
    301     }
    302 
    303     void setSystemIdentifierToEmptyString()
    304     {
    305         ASSERT(m_type == DOCTYPE);
    306         m_doctypeData->m_hasSystemIdentifier = true;
    307         m_doctypeData->m_systemIdentifier.clear();
    308     }
    309 
    310     bool forceQuirks() const
    311     {
    312         ASSERT(m_type == DOCTYPE);
    313         return m_doctypeData->m_forceQuirks;
    314     }
    315 
    316     void setForceQuirks()
    317     {
    318         ASSERT(m_type == DOCTYPE);
    319         m_doctypeData->m_forceQuirks = true;
    320     }
    321 
    322     void appendToPublicIdentifier(UChar character)
    323     {
    324         ASSERT(character);
    325         ASSERT(m_type == DOCTYPE);
    326         ASSERT(m_doctypeData->m_hasPublicIdentifier);
    327         m_doctypeData->m_publicIdentifier.append(character);
    328     }
    329 
    330     void appendToSystemIdentifier(UChar character)
    331     {
    332         ASSERT(character);
    333         ASSERT(m_type == DOCTYPE);
    334         ASSERT(m_doctypeData->m_hasSystemIdentifier);
    335         m_doctypeData->m_systemIdentifier.append(character);
    336     }
    337 
    338 private:
    339     // FIXME: I'm not sure what the final relationship between HTMLToken and
    340     // AtomicHTMLToken will be.  I'm marking this a friend for now, but we'll
    341     // want to end up with a cleaner interface between the two classes.
    342     friend class AtomicHTMLToken;
    343 
    344     class DoctypeData {
    345         WTF_MAKE_NONCOPYABLE(DoctypeData);
    346     public:
    347         DoctypeData()
    348             : m_hasPublicIdentifier(false)
    349             , m_hasSystemIdentifier(false)
    350             , m_forceQuirks(false)
    351         {
    352         }
    353 
    354         bool m_hasPublicIdentifier;
    355         bool m_hasSystemIdentifier;
    356         bool m_forceQuirks;
    357         WTF::Vector<UChar> m_publicIdentifier;
    358         WTF::Vector<UChar> m_systemIdentifier;
    359     };
    360 
    361     Type m_type;
    362     Range m_range; // Always starts at zero.
    363     int m_baseOffset;
    364 
    365     // "name" for DOCTYPE, StartTag, and EndTag
    366     // "characters" for Character
    367     // "data" for Comment
    368     DataVector m_data;
    369 
    370     // For DOCTYPE
    371     OwnPtr<DoctypeData> m_doctypeData;
    372 
    373     // For StartTag and EndTag
    374     bool m_selfClosing;
    375     AttributeList m_attributes;
    376 
    377     // A pointer into m_attributes used during lexing.
    378     Attribute* m_currentAttribute;
    379 };
    380 
    381 // FIXME: This class should eventually be named HTMLToken once we move the
    382 // exiting HTMLToken to be internal to the HTMLTokenizer.
    383 class AtomicHTMLToken {
    384     WTF_MAKE_NONCOPYABLE(AtomicHTMLToken);
    385 public:
    386     AtomicHTMLToken(HTMLToken& token)
    387         : m_type(token.type())
    388     {
    389         switch (m_type) {
    390         case HTMLToken::Uninitialized:
    391             ASSERT_NOT_REACHED();
    392             break;
    393         case HTMLToken::DOCTYPE:
    394             m_name = AtomicString(token.name().data(), token.name().size());
    395             m_doctypeData = token.m_doctypeData.release();
    396             break;
    397         case HTMLToken::EndOfFile:
    398             break;
    399         case HTMLToken::StartTag:
    400         case HTMLToken::EndTag: {
    401             m_selfClosing = token.selfClosing();
    402             m_name = AtomicString(token.name().data(), token.name().size());
    403             initializeAttributes(token.attributes());
    404             break;
    405         }
    406         case HTMLToken::Comment:
    407             m_data = String(token.comment().data(), token.comment().size());
    408             break;
    409         case HTMLToken::Character:
    410             m_externalCharacters = &token.characters();
    411             break;
    412         }
    413     }
    414 
    415     AtomicHTMLToken(HTMLToken::Type type, AtomicString name, PassRefPtr<NamedNodeMap> attributes = 0)
    416         : m_type(type)
    417         , m_name(name)
    418         , m_attributes(attributes)
    419     {
    420         ASSERT(usesName());
    421     }
    422 
    423     HTMLToken::Type type() const { return m_type; }
    424 
    425     const AtomicString& name() const
    426     {
    427         ASSERT(usesName());
    428         return m_name;
    429     }
    430 
    431     void setName(const AtomicString& name)
    432     {
    433         ASSERT(usesName());
    434         m_name = name;
    435     }
    436 
    437     bool selfClosing() const
    438     {
    439         ASSERT(m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag);
    440         return m_selfClosing;
    441     }
    442 
    443     Attribute* getAttributeItem(const QualifiedName& attributeName)
    444     {
    445         ASSERT(usesAttributes());
    446         if (!m_attributes)
    447             return 0;
    448         return m_attributes->getAttributeItem(attributeName);
    449     }
    450 
    451     NamedNodeMap* attributes() const
    452     {
    453         ASSERT(usesAttributes());
    454         return m_attributes.get();
    455     }
    456 
    457     PassRefPtr<NamedNodeMap> takeAtributes()
    458     {
    459         ASSERT(usesAttributes());
    460         return m_attributes.release();
    461     }
    462 
    463     const HTMLToken::DataVector& characters() const
    464     {
    465         ASSERT(m_type == HTMLToken::Character);
    466         return *m_externalCharacters;
    467     }
    468 
    469     const String& comment() const
    470     {
    471         ASSERT(m_type == HTMLToken::Comment);
    472         return m_data;
    473     }
    474 
    475     // FIXME: Distinguish between a missing public identifer and an empty one.
    476     WTF::Vector<UChar>& publicIdentifier() const
    477     {
    478         ASSERT(m_type == HTMLToken::DOCTYPE);
    479         return m_doctypeData->m_publicIdentifier;
    480     }
    481 
    482     // FIXME: Distinguish between a missing system identifer and an empty one.
    483     WTF::Vector<UChar>& systemIdentifier() const
    484     {
    485         ASSERT(m_type == HTMLToken::DOCTYPE);
    486         return m_doctypeData->m_systemIdentifier;
    487     }
    488 
    489     bool forceQuirks() const
    490     {
    491         ASSERT(m_type == HTMLToken::DOCTYPE);
    492         return m_doctypeData->m_forceQuirks;
    493     }
    494 
    495 private:
    496     HTMLToken::Type m_type;
    497 
    498     void initializeAttributes(const HTMLToken::AttributeList& attributes);
    499 
    500     bool usesName() const
    501     {
    502         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag || m_type == HTMLToken::DOCTYPE;
    503     }
    504 
    505     bool usesAttributes() const
    506     {
    507         return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
    508     }
    509 
    510     // "name" for DOCTYPE, StartTag, and EndTag
    511     AtomicString m_name;
    512 
    513     // "data" for Comment
    514     String m_data;
    515 
    516     // "characters" for Character
    517     //
    518     // We don't want to copy the the characters out of the HTMLToken, so we
    519     // keep a pointer to its buffer instead.  This buffer is owned by the
    520     // HTMLToken and causes a lifetime dependence between these objects.
    521     //
    522     // FIXME: Add a mechanism for "internalizing" the characters when the
    523     //        HTMLToken is destructed.
    524     const HTMLToken::DataVector* m_externalCharacters;
    525 
    526     // For DOCTYPE
    527     OwnPtr<HTMLToken::DoctypeData> m_doctypeData;
    528 
    529     // For StartTag and EndTag
    530     bool m_selfClosing;
    531 
    532     RefPtr<NamedNodeMap> m_attributes;
    533 };
    534 
    535 inline void AtomicHTMLToken::initializeAttributes(const HTMLToken::AttributeList& attributes)
    536 {
    537     size_t size = attributes.size();
    538     if (!size)
    539         return;
    540 
    541     m_attributes = NamedNodeMap::create();
    542     m_attributes->reserveInitialCapacity(size);
    543     for (size_t i = 0; i < size; ++i) {
    544         const HTMLToken::Attribute& attribute = attributes[i];
    545         if (attribute.m_name.isEmpty())
    546             continue;
    547 
    548         ASSERT(attribute.m_nameRange.m_start);
    549         ASSERT(attribute.m_nameRange.m_end);
    550         ASSERT(attribute.m_valueRange.m_start);
    551         ASSERT(attribute.m_valueRange.m_end);
    552 
    553         String name(attribute.m_name.data(), attribute.m_name.size());
    554         String value(attribute.m_value.data(), attribute.m_value.size());
    555         m_attributes->insertAttribute(Attribute::createMapped(name, value), false);
    556     }
    557 }
    558 
    559 }
    560 
    561 #endif
    562