Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
      4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "core/html/parser/HTMLEntityParser.h"
     30 
     31 #include "core/html/parser/HTMLEntitySearch.h"
     32 #include "core/html/parser/HTMLEntityTable.h"
     33 #include "wtf/text/StringBuilder.h"
     34 
     35 using namespace WTF;
     36 
     37 namespace blink {
     38 
// Replacement code points for numeric character references whose value lies
// in [0x80, 0x9F]. adjustEntity() indexes this table with (value - 0x80).
// Judging by the name, these are the Windows Latin-1 (code page 1252)
// extension characters; slots such as 0x81, 0x8D, 0x8F, 0x90 and 0x9D map to
// themselves.
static const UChar windowsLatin1ExtensionArray[32] = {
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
};
     45 
     46 static bool isAlphaNumeric(UChar cc)
     47 {
     48     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
     49 }
     50 
     51 static UChar adjustEntity(UChar32 value)
     52 {
     53     if ((value & ~0x1F) != 0x0080)
     54         return value;
     55     return windowsLatin1ExtensionArray[value - 0x80];
     56 }
     57 
     58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
     59 {
     60     // FIXME: A number of specific entity values generate parse errors.
     61     if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
     62         decodedEntity.append(0xFFFD);
     63         return;
     64     }
     65     if (U_IS_BMP(c)) {
     66         decodedEntity.append(adjustEntity(c));
     67         return;
     68     }
     69     decodedEntity.append(c);
     70 }
     71 
// Sentinel stored in the numeric accumulator of consumeHTMLEntity() once the
// value exceeds UCHAR_MAX_VALUE. Further digits are then ignored, and
// appendLegalEntityFor() turns the non-positive value into U+FFFD.
static const UChar32 kInvalidUnicode = -1;
     73 
     74 static bool isHexDigit(UChar cc)
     75 {
     76     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
     77 }
     78 
     79 static UChar asHexDigit(UChar cc)
     80 {
     81     if (cc >= '0' && cc <= '9')
     82       return cc - '0';
     83     if (cc >= 'a' && cc <= 'z')
     84       return 10 + cc - 'a';
     85     if (cc >= 'A' && cc <= 'Z')
     86       return 10 + cc - 'A';
     87     ASSERT_NOT_REACHED();
     88     return 0;
     89 }
     90 
// Buffer of characters already taken from the input while attempting to parse
// an entity, kept so they can be handed back by unconsumeCharacters(). The
// Vector is declared with an inline capacity of 64.
typedef Vector<UChar, 64> ConsumedCharacterBuffer;
     92 
     93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
     94 {
     95     if (consumedCharacters.size() == 1)
     96         source.push(consumedCharacters[0]);
     97     else if (consumedCharacters.size() == 2) {
     98         source.push(consumedCharacters[0]);
     99         source.push(consumedCharacters[1]);
    100     } else
    101         source.prepend(SegmentedString(String(consumedCharacters)));
    102 }
    103 
    104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
    105 {
    106     ConsumedCharacterBuffer consumedCharacters;
    107     HTMLEntitySearch entitySearch;
    108     while (!source.isEmpty()) {
    109         cc = source.currentChar();
    110         entitySearch.advance(cc);
    111         if (!entitySearch.isEntityPrefix())
    112             break;
    113         consumedCharacters.append(cc);
    114         source.advanceAndASSERT(cc);
    115     }
    116     notEnoughCharacters = source.isEmpty();
    117     if (notEnoughCharacters) {
    118         // We can't decide on an entity because there might be a longer entity
    119         // that we could match if we had more data.
    120         unconsumeCharacters(source, consumedCharacters);
    121         return false;
    122     }
    123     if (!entitySearch.mostRecentMatch()) {
    124         unconsumeCharacters(source, consumedCharacters);
    125         return false;
    126     }
    127     if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
    128         // We've consumed too many characters. We need to walk the
    129         // source back to the point at which we had consumed an
    130         // actual entity.
    131         unconsumeCharacters(source, consumedCharacters);
    132         consumedCharacters.clear();
    133         const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
    134         const int length = mostRecent->length;
    135         const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
    136         for (int i = 0; i < length; ++i) {
    137             cc = source.currentChar();
    138             ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
    139             consumedCharacters.append(cc);
    140             source.advanceAndASSERT(cc);
    141             ASSERT(!source.isEmpty());
    142         }
    143         cc = source.currentChar();
    144     }
    145     if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
    146         || !additionalAllowedCharacter
    147         || !(isAlphaNumeric(cc) || cc == '=')) {
    148         decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
    149         if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
    150             decodedEntity.append(second);
    151         return true;
    152     }
    153     unconsumeCharacters(source, consumedCharacters);
    154     return false;
    155 }
    156 
// Attempts to consume one character reference from |source|; the asserts below
// require |notEnoughCharacters| to be false and |decodedEntity| to be empty
// on entry.
//
// Returns true when a reference was decoded into |decodedEntity|. Returns
// false otherwise; when the input was exhausted before a decision could be
// made, |notEnoughCharacters| is set to true and the consumed characters are
// handed back to |source|. |additionalAllowedCharacter| is either 0 or one
// of '"', '\'' and '>' (asserted below); a reference starting with that
// character is rejected immediately.
//
// Inside the state machine, `break` leaves the switch and falls through to
// the common tail, which records and consumes the current character;
// `continue` re-dispatches the same, still unconsumed, character in the new
// state.
bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
{
    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
    ASSERT(!notEnoughCharacters);
    ASSERT(decodedEntity.isEmpty());

    enum EntityState {
        Initial,
        Number,
        MaybeHexLowerCaseX,
        MaybeHexUpperCaseX,
        Hex,
        Decimal,
        Named
    };
    EntityState entityState = Initial;
    // Accumulated numeric value; becomes kInvalidUnicode once it overflows.
    UChar32 result = 0;
    ConsumedCharacterBuffer consumedCharacters;

    while (!source.isEmpty()) {
        UChar cc = source.currentChar();
        switch (entityState) {
        case Initial: {
            // Whitespace, '<', '&' and the additional allowed character mean
            // this is not a character reference at all.
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
                return false;
            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
                return false;
            if (cc == '#') {
                entityState = Number;
                break;
            }
            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
                entityState = Named;
                continue;
            }
            return false;
        }
        case Number: {
            if (cc == 'x') {
                entityState = MaybeHexLowerCaseX;
                break;
            }
            if (cc == 'X') {
                entityState = MaybeHexUpperCaseX;
                break;
            }
            if (cc >= '0' && cc <= '9') {
                entityState = Decimal;
                continue;
            }
            // Not a numeric reference: give back the '#' consumed earlier.
            source.push('#');
            return false;
        }
        case MaybeHexLowerCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            // No hex digit after "#x": give back both consumed characters.
            source.push('#');
            source.push('x');
            return false;
        }
        case MaybeHexUpperCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            // No hex digit after "#X": give back both consumed characters.
            source.push('#');
            source.push('X');
            return false;
        }
        case Hex: {
            if (isHexDigit(cc)) {
                // Stop accumulating once the value has been marked invalid.
                if (result != kInvalidUnicode)
                    result = result * 16 + asHexDigit(cc);
            } else if (cc == ';') {
                // The terminating ';' is part of the reference: consume it.
                source.advanceAndASSERT(cc);
                appendLegalEntityFor(result, decodedEntity);
                return true;
            } else {
                // Missing ';': still decode, leaving |cc| unconsumed.
                appendLegalEntityFor(result, decodedEntity);
                return true;
            }
            break;
        }
        case Decimal: {
            if (cc >= '0' && cc <= '9') {
                // Stop accumulating once the value has been marked invalid.
                if (result != kInvalidUnicode)
                    result = result * 10 + cc - '0';
            } else if (cc == ';') {
                // The terminating ';' is part of the reference: consume it.
                source.advanceAndASSERT(cc);
                appendLegalEntityFor(result, decodedEntity);
                return true;
            } else {
                // Missing ';': still decode, leaving |cc| unconsumed.
                appendLegalEntityFor(result, decodedEntity);
                return true;
            }
            break;
        }
        case Named: {
            return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
        }
        }

        // Clamp on every step so that later multiplications start from the
        // sentinel instead of an ever-growing value.
        if (result > UCHAR_MAX_VALUE)
            result = kInvalidUnicode;

        // Common tail for `break`: remember the character and consume it.
        consumedCharacters.append(cc);
        source.advanceAndASSERT(cc);
    }
    // Input ran out mid-reference: restore it and ask the caller for more.
    ASSERT(source.isEmpty());
    notEnoughCharacters = true;
    unconsumeCharacters(source, consumedCharacters);
    return false;
}
    272 
    273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
    274 {
    275     if (U_IS_BMP(value)) {
    276         UChar character = static_cast<UChar>(value);
    277         ASSERT(character == value);
    278         result[0] = character;
    279         return 1;
    280     }
    281 
    282     result[0] = U16_LEAD(value);
    283     result[1] = U16_TRAIL(value);
    284     return 2;
    285 }
    286 
    287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
    288 {
    289     HTMLEntitySearch search;
    290     while (*name) {
    291         search.advance(*name++);
    292         if (!search.isEntityPrefix())
    293             return 0;
    294     }
    295     search.advance(';');
    296     if (!search.isEntityPrefix())
    297         return 0;
    298 
    299     size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
    300     if (!search.mostRecentMatch()->secondValue)
    301         return numberOfCodePoints;
    302     return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
    303 }
    304 
    305 } // namespace blink
    306