Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
      4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "core/html/parser/HTMLEntityParser.h"
     30 
     31 #include "core/html/parser/HTMLEntitySearch.h"
     32 #include "core/html/parser/HTMLEntityTable.h"
     33 #include "wtf/text/StringBuilder.h"
     34 
     35 using namespace WTF;
     36 
     37 namespace WebCore {
     38 
     39 static const UChar windowsLatin1ExtensionArray[32] = {
     40     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
     41     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
     42     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
     43     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
     44 };
     45 
     46 static bool isAlphaNumeric(UChar cc)
     47 {
     48     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
     49 }
     50 
     51 static UChar adjustEntity(UChar32 value)
     52 {
     53     if ((value & ~0x1F) != 0x0080)
     54         return value;
     55     return windowsLatin1ExtensionArray[value - 0x80];
     56 }
     57 
     58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
     59 {
     60     // FIXME: A number of specific entity values generate parse errors.
     61     if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
     62         decodedEntity.append(0xFFFD);
     63         return;
     64     }
     65     if (U_IS_BMP(c)) {
     66         decodedEntity.append(adjustEntity(c));
     67         return;
     68     }
     69     decodedEntity.append(c);
     70 }
     71 
     72 static const UChar32 kInvalidUnicode = -1;
     73 
     74 static bool isHexDigit(UChar cc)
     75 {
     76     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
     77 }
     78 
     79 static UChar asHexDigit(UChar cc)
     80 {
     81     if (cc >= '0' && cc <= '9')
     82       return cc - '0';
     83     if (cc >= 'a' && cc <= 'z')
     84       return 10 + cc - 'a';
     85     if (cc >= 'A' && cc <= 'Z')
     86       return 10 + cc - 'A';
     87     ASSERT_NOT_REACHED();
     88     return 0;
     89 }
     90 
     91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;
     92 
     93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
     94 {
     95     if (consumedCharacters.size() == 1)
     96         source.push(consumedCharacters[0]);
     97     else if (consumedCharacters.size() == 2) {
     98         source.push(consumedCharacters[0]);
     99         source.push(consumedCharacters[1]);
    100     } else
    101         source.prepend(SegmentedString(String(consumedCharacters)));
    102 }
    103 
    104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
    105 {
    106     ConsumedCharacterBuffer consumedCharacters;
    107     HTMLEntitySearch entitySearch;
    108     while (!source.isEmpty()) {
    109         cc = source.currentChar();
    110         entitySearch.advance(cc);
    111         if (!entitySearch.isEntityPrefix())
    112             break;
    113         consumedCharacters.append(cc);
    114         source.advanceAndASSERT(cc);
    115     }
    116     notEnoughCharacters = source.isEmpty();
    117     if (notEnoughCharacters) {
    118         // We can't an entity because there might be a longer entity
    119         // that we could match if we had more data.
    120         unconsumeCharacters(source, consumedCharacters);
    121         return false;
    122     }
    123     if (!entitySearch.mostRecentMatch()) {
    124         unconsumeCharacters(source, consumedCharacters);
    125         return false;
    126     }
    127     if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
    128         // We've consumed too many characters. We need to walk the
    129         // source back to the point at which we had consumed an
    130         // actual entity.
    131         unconsumeCharacters(source, consumedCharacters);
    132         consumedCharacters.clear();
    133         const int length = entitySearch.mostRecentMatch()->length;
    134         const UChar* reference = entitySearch.mostRecentMatch()->entity;
    135         for (int i = 0; i < length; ++i) {
    136             cc = source.currentChar();
    137             ASSERT_UNUSED(reference, cc == *reference++);
    138             consumedCharacters.append(cc);
    139             source.advanceAndASSERT(cc);
    140             ASSERT(!source.isEmpty());
    141         }
    142         cc = source.currentChar();
    143     }
    144     if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
    145         || !additionalAllowedCharacter
    146         || !(isAlphaNumeric(cc) || cc == '=')) {
    147         decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
    148         if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
    149             decodedEntity.append(second);
    150         return true;
    151     }
    152     unconsumeCharacters(source, consumedCharacters);
    153     return false;
    154 }
    155 
    156 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
    157 {
    158     ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
    159     ASSERT(!notEnoughCharacters);
    160     ASSERT(decodedEntity.isEmpty());
    161 
    162     enum EntityState {
    163         Initial,
    164         Number,
    165         MaybeHexLowerCaseX,
    166         MaybeHexUpperCaseX,
    167         Hex,
    168         Decimal,
    169         Named
    170     };
    171     EntityState entityState = Initial;
    172     UChar32 result = 0;
    173     ConsumedCharacterBuffer consumedCharacters;
    174 
    175     while (!source.isEmpty()) {
    176         UChar cc = source.currentChar();
    177         switch (entityState) {
    178         case Initial: {
    179             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
    180                 return false;
    181             if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
    182                 return false;
    183             if (cc == '#') {
    184                 entityState = Number;
    185                 break;
    186             }
    187             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
    188                 entityState = Named;
    189                 continue;
    190             }
    191             return false;
    192         }
    193         case Number: {
    194             if (cc == 'x') {
    195                 entityState = MaybeHexLowerCaseX;
    196                 break;
    197             }
    198             if (cc == 'X') {
    199                 entityState = MaybeHexUpperCaseX;
    200                 break;
    201             }
    202             if (cc >= '0' && cc <= '9') {
    203                 entityState = Decimal;
    204                 continue;
    205             }
    206             source.push('#');
    207             return false;
    208         }
    209         case MaybeHexLowerCaseX: {
    210             if (isHexDigit(cc)) {
    211                 entityState = Hex;
    212                 continue;
    213             }
    214             source.push('#');
    215             source.push('x');
    216             return false;
    217         }
    218         case MaybeHexUpperCaseX: {
    219             if (isHexDigit(cc)) {
    220                 entityState = Hex;
    221                 continue;
    222             }
    223             source.push('#');
    224             source.push('X');
    225             return false;
    226         }
    227         case Hex: {
    228             if (isHexDigit(cc)) {
    229                 if (result != kInvalidUnicode)
    230                     result = result * 16 + asHexDigit(cc);
    231             } else if (cc == ';') {
    232                 source.advanceAndASSERT(cc);
    233                 appendLegalEntityFor(result, decodedEntity);
    234                 return true;
    235             } else {
    236                 appendLegalEntityFor(result, decodedEntity);
    237                 return true;
    238             }
    239             break;
    240         }
    241         case Decimal: {
    242             if (cc >= '0' && cc <= '9') {
    243                 if (result != kInvalidUnicode)
    244                     result = result * 10 + cc - '0';
    245             } else if (cc == ';') {
    246                 source.advanceAndASSERT(cc);
    247                 appendLegalEntityFor(result, decodedEntity);
    248                 return true;
    249             } else {
    250                 appendLegalEntityFor(result, decodedEntity);
    251                 return true;
    252             }
    253             break;
    254         }
    255         case Named: {
    256             return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
    257         }
    258         }
    259 
    260         if (result > UCHAR_MAX_VALUE)
    261             result = kInvalidUnicode;
    262 
    263         consumedCharacters.append(cc);
    264         source.advanceAndASSERT(cc);
    265     }
    266     ASSERT(source.isEmpty());
    267     notEnoughCharacters = true;
    268     unconsumeCharacters(source, consumedCharacters);
    269     return false;
    270 }
    271 
    272 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
    273 {
    274     if (U_IS_BMP(value)) {
    275         UChar character = static_cast<UChar>(value);
    276         ASSERT(character == value);
    277         result[0] = character;
    278         return 1;
    279     }
    280 
    281     result[0] = U16_LEAD(value);
    282     result[1] = U16_TRAIL(value);
    283     return 2;
    284 }
    285 
    286 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
    287 {
    288     HTMLEntitySearch search;
    289     while (*name) {
    290         search.advance(*name++);
    291         if (!search.isEntityPrefix())
    292             return 0;
    293     }
    294     search.advance(';');
    295     if (!search.isEntityPrefix())
    296         return 0;
    297 
    298     size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
    299     if (!search.mostRecentMatch()->secondValue)
    300         return numberOfCodePoints;
    301     return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
    302 }
    303 
    304 } // namespace WebCore
    305