Home | History | Annotate | Download | only in parser
      1 /*
      2  * Copyright (C) 2008 Apple Inc. All Rights Reserved.
      3  * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
      4  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "HTMLEntityParser.h"
     30 
     31 #include "HTMLEntitySearch.h"
     32 #include "HTMLEntityTable.h"
     33 #include <wtf/Vector.h>
     34 
     35 using namespace WTF;
     36 
     37 namespace WebCore {
     38 
     39 namespace {
     40 
     41 static const UChar windowsLatin1ExtensionArray[32] = {
     42     0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
     43     0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
     44     0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
     45     0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
     46 };
     47 
     48 inline UChar adjustEntity(UChar32 value)
     49 {
     50     if ((value & ~0x1F) != 0x0080)
     51         return value;
     52     return windowsLatin1ExtensionArray[value - 0x80];
     53 }
     54 
     55 inline UChar32 legalEntityFor(UChar32 value)
     56 {
     57     // FIXME: A number of specific entity values generate parse errors.
     58     if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))
     59         return 0xFFFD;
     60     if (U_IS_BMP(value))
     61         return adjustEntity(value);
     62     return value;
     63 }
     64 
     65 inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity)
     66 {
     67     if (U_IS_BMP(value)) {
     68         UChar character = static_cast<UChar>(value);
     69         ASSERT(character == value);
     70         decodedEntity.append(character);
     71         return true;
     72     }
     73     decodedEntity.append(U16_LEAD(value));
     74     decodedEntity.append(U16_TRAIL(value));
     75     return true;
     76 }
     77 
     78 inline bool isHexDigit(UChar cc)
     79 {
     80     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
     81 }
     82 
     83 inline bool isAlphaNumeric(UChar cc)
     84 {
     85     return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
     86 }
     87 
     88 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters)
     89 {
     90     if (consumedCharacters.size() == 1)
     91         source.push(consumedCharacters[0]);
     92     else if (consumedCharacters.size() == 2) {
     93         source.push(consumedCharacters[0]);
     94         source.push(consumedCharacters[1]);
     95     } else
     96         source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size())));
     97 }
     98 
     99 }
    100 
    101 bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
    102 {
    103     ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
    104     ASSERT(!notEnoughCharacters);
    105     ASSERT(decodedEntity.isEmpty());
    106 
    107     enum EntityState {
    108         Initial,
    109         Number,
    110         MaybeHexLowerCaseX,
    111         MaybeHexUpperCaseX,
    112         Hex,
    113         Decimal,
    114         Named
    115     };
    116     EntityState entityState = Initial;
    117     UChar32 result = 0;
    118     Vector<UChar, 10> consumedCharacters;
    119 
    120     while (!source.isEmpty()) {
    121         UChar cc = *source;
    122         switch (entityState) {
    123         case Initial: {
    124             if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
    125                 return false;
    126             if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
    127                 return false;
    128             if (cc == '#') {
    129                 entityState = Number;
    130                 break;
    131             }
    132             if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
    133                 entityState = Named;
    134                 continue;
    135             }
    136             return false;
    137         }
    138         case Number: {
    139             if (cc == 'x') {
    140                 entityState = MaybeHexLowerCaseX;
    141                 break;
    142             }
    143             if (cc == 'X') {
    144                 entityState = MaybeHexUpperCaseX;
    145                 break;
    146             }
    147             if (cc >= '0' && cc <= '9') {
    148                 entityState = Decimal;
    149                 continue;
    150             }
    151             source.push('#');
    152             return false;
    153         }
    154         case MaybeHexLowerCaseX: {
    155             if (isHexDigit(cc)) {
    156                 entityState = Hex;
    157                 continue;
    158             }
    159             source.push('#');
    160             source.push('x');
    161             return false;
    162         }
    163         case MaybeHexUpperCaseX: {
    164             if (isHexDigit(cc)) {
    165                 entityState = Hex;
    166                 continue;
    167             }
    168             source.push('#');
    169             source.push('X');
    170             return false;
    171         }
    172         case Hex: {
    173             if (cc >= '0' && cc <= '9')
    174                 result = result * 16 + cc - '0';
    175             else if (cc >= 'a' && cc <= 'f')
    176                 result = result * 16 + 10 + cc - 'a';
    177             else if (cc >= 'A' && cc <= 'F')
    178                 result = result * 16 + 10 + cc - 'A';
    179             else {
    180                 if (cc == ';')
    181                     source.advanceAndASSERT(cc);
    182                 return convertToUTF16(legalEntityFor(result), decodedEntity);
    183             }
    184             break;
    185         }
    186         case Decimal: {
    187             if (cc >= '0' && cc <= '9')
    188                 result = result * 10 + cc - '0';
    189             else {
    190                 if (cc == ';')
    191                     source.advanceAndASSERT(cc);
    192                 return convertToUTF16(legalEntityFor(result), decodedEntity);
    193             }
    194             break;
    195         }
    196         case Named: {
    197             HTMLEntitySearch entitySearch;
    198             while (!source.isEmpty()) {
    199                 cc = *source;
    200                 entitySearch.advance(cc);
    201                 if (!entitySearch.isEntityPrefix())
    202                     break;
    203                 consumedCharacters.append(cc);
    204                 source.advanceAndASSERT(cc);
    205             }
    206             notEnoughCharacters = source.isEmpty();
    207             if (notEnoughCharacters) {
    208                 // We can't an entity because there might be a longer entity
    209                 // that we could match if we had more data.
    210                 unconsumeCharacters(source, consumedCharacters);
    211                 return false;
    212             }
    213             if (!entitySearch.mostRecentMatch()) {
    214                 ASSERT(!entitySearch.currentValue());
    215                 unconsumeCharacters(source, consumedCharacters);
    216                 return false;
    217             }
    218             if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
    219                 // We've consumed too many characters.  We need to walk the
    220                 // source back to the point at which we had consumed an
    221                 // actual entity.
    222                 unconsumeCharacters(source, consumedCharacters);
    223                 consumedCharacters.clear();
    224                 const int length = entitySearch.mostRecentMatch()->length;
    225                 const UChar* reference = entitySearch.mostRecentMatch()->entity;
    226                 for (int i = 0; i < length; ++i) {
    227                     cc = *source;
    228                     ASSERT_UNUSED(reference, cc == *reference++);
    229                     consumedCharacters.append(cc);
    230                     source.advanceAndASSERT(cc);
    231                     ASSERT(!source.isEmpty());
    232                 }
    233                 cc = *source;
    234             }
    235             if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
    236                 || !additionalAllowedCharacter
    237                 || !(isAlphaNumeric(cc) || cc == '=')) {
    238                 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
    239             }
    240             unconsumeCharacters(source, consumedCharacters);
    241             return false;
    242         }
    243         }
    244         consumedCharacters.append(cc);
    245         source.advanceAndASSERT(cc);
    246     }
    247     ASSERT(source.isEmpty());
    248     notEnoughCharacters = true;
    249     unconsumeCharacters(source, consumedCharacters);
    250     return false;
    251 }
    252 
    253 UChar decodeNamedEntity(const char* name)
    254 {
    255     HTMLEntitySearch search;
    256     while (*name) {
    257         search.advance(*name++);
    258         if (!search.isEntityPrefix())
    259             return 0;
    260     }
    261     search.advance(';');
    262     UChar32 entityValue = search.currentValue();
    263     if (U16_LENGTH(entityValue) != 1) {
    264         // Callers need to move off this API if the entity table has values
    265         // which do no fit in a 16 bit UChar!
    266         ASSERT_NOT_REACHED();
    267         return 0;
    268     }
    269     return static_cast<UChar>(entityValue);
    270 }
    271 
    272 } // namespace WebCore
    273