1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/html/parser/HTMLEntityParser.h" 30 31 #include "core/html/parser/HTMLEntitySearch.h" 32 #include "core/html/parser/HTMLEntityTable.h" 33 #include "wtf/text/StringBuilder.h" 34 35 using namespace WTF; 36 37 namespace blink { 38 39 static const UChar windowsLatin1ExtensionArray[32] = { 40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F 44 }; 45 46 static bool isAlphaNumeric(UChar cc) 47 { 48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); 49 } 50 51 static UChar adjustEntity(UChar32 value) 52 { 53 if ((value & ~0x1F) != 0x0080) 54 return value; 55 return windowsLatin1ExtensionArray[value - 0x80]; 56 } 57 58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity) 59 { 60 // FIXME: A number of specific entity values generate parse errors. 61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { 62 decodedEntity.append(0xFFFD); 63 return; 64 } 65 if (U_IS_BMP(c)) { 66 decodedEntity.append(adjustEntity(c)); 67 return; 68 } 69 decodedEntity.append(c); 70 } 71 72 static const UChar32 kInvalidUnicode = -1; 73 74 static bool isHexDigit(UChar cc) 75 { 76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); 77 } 78 79 static UChar asHexDigit(UChar cc) 80 { 81 if (cc >= '0' && cc <= '9') 82 return cc - '0'; 83 if (cc >= 'a' && cc <= 'z') 84 return 10 + cc - 'a'; 85 if (cc >= 'A' && cc <= 'Z') 86 return 10 + cc - 'A'; 87 ASSERT_NOT_REACHED(); 88 return 0; 89 } 90 91 typedef Vector<UChar, 64> ConsumedCharacterBuffer; 92 93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters) 94 { 95 if (consumedCharacters.size() == 1) 96 source.push(consumedCharacters[0]); 97 else if (consumedCharacters.size() == 2) { 98 source.push(consumedCharacters[0]); 99 source.push(consumedCharacters[1]); 100 } else 101 source.prepend(SegmentedString(String(consumedCharacters))); 102 } 103 104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) 105 { 106 ConsumedCharacterBuffer consumedCharacters; 107 HTMLEntitySearch entitySearch; 108 while (!source.isEmpty()) { 109 cc = source.currentChar(); 110 entitySearch.advance(cc); 111 if (!entitySearch.isEntityPrefix()) 112 break; 113 consumedCharacters.append(cc); 114 source.advanceAndASSERT(cc); 115 } 116 notEnoughCharacters = source.isEmpty(); 117 if (notEnoughCharacters) { 118 // We can't decide on an entity because there might be a longer entity 119 // that we could match if we had more data. 120 unconsumeCharacters(source, consumedCharacters); 121 return false; 122 } 123 if (!entitySearch.mostRecentMatch()) { 124 unconsumeCharacters(source, consumedCharacters); 125 return false; 126 } 127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 128 // We've consumed too many characters. We need to walk the 129 // source back to the point at which we had consumed an 130 // actual entity. 131 unconsumeCharacters(source, consumedCharacters); 132 consumedCharacters.clear(); 133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch(); 134 const int length = mostRecent->length; 135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent); 136 for (int i = 0; i < length; ++i) { 137 cc = source.currentChar(); 138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++)); 139 consumedCharacters.append(cc); 140 source.advanceAndASSERT(cc); 141 ASSERT(!source.isEmpty()); 142 } 143 cc = source.currentChar(); 144 } 145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 146 || !additionalAllowedCharacter 147 || !(isAlphaNumeric(cc) || cc == '=')) { 148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); 149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue) 150 decodedEntity.append(second); 151 return true; 152 } 153 unconsumeCharacters(source, consumedCharacters); 154 return false; 155 } 156 157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 158 { 159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 160 ASSERT(!notEnoughCharacters); 161 ASSERT(decodedEntity.isEmpty()); 162 163 enum EntityState { 164 Initial, 165 Number, 166 MaybeHexLowerCaseX, 167 MaybeHexUpperCaseX, 168 Hex, 169 Decimal, 170 Named 171 }; 172 EntityState entityState = Initial; 173 UChar32 result = 0; 174 ConsumedCharacterBuffer consumedCharacters; 175 176 while (!source.isEmpty()) { 177 UChar cc = source.currentChar(); 178 switch (entityState) { 179 case Initial: { 180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') 181 return false; 182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) 183 return false; 184 if (cc == '#') { 185 entityState = Number; 186 break; 187 } 188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { 189 entityState = Named; 190 continue; 191 } 192 return false; 193 } 194 case Number: { 195 if (cc == 'x') { 196 entityState = MaybeHexLowerCaseX; 197 break; 198 } 199 if (cc == 'X') { 200 entityState = MaybeHexUpperCaseX; 201 break; 202 } 203 if (cc >= '0' && cc <= '9') { 204 entityState = Decimal; 205 continue; 206 } 207 source.push('#'); 208 return false; 209 } 210 case MaybeHexLowerCaseX: { 211 if (isHexDigit(cc)) { 212 entityState = Hex; 213 continue; 214 } 215 source.push('#'); 216 source.push('x'); 217 return false; 218 } 219 case MaybeHexUpperCaseX: { 220 if (isHexDigit(cc)) { 221 entityState = Hex; 222 continue; 223 } 224 source.push('#'); 225 source.push('X'); 226 return false; 227 } 228 case Hex: { 229 if (isHexDigit(cc)) { 230 if (result != kInvalidUnicode) 231 result = result * 16 + asHexDigit(cc); 232 } else if (cc == ';') { 233 source.advanceAndASSERT(cc); 234 appendLegalEntityFor(result, decodedEntity); 235 return true; 236 } else { 237 appendLegalEntityFor(result, decodedEntity); 238 return true; 239 } 240 break; 241 } 242 case Decimal: { 243 if (cc >= '0' && cc <= '9') { 244 if (result != kInvalidUnicode) 245 result = result * 10 + cc - '0'; 246 } else if (cc == ';') { 247 source.advanceAndASSERT(cc); 248 appendLegalEntityFor(result, decodedEntity); 249 return true; 250 } else { 251 appendLegalEntityFor(result, decodedEntity); 252 return true; 253 } 254 break; 255 } 256 case Named: { 257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc); 258 } 259 } 260 261 if (result > UCHAR_MAX_VALUE) 262 result = kInvalidUnicode; 263 264 consumedCharacters.append(cc); 265 source.advanceAndASSERT(cc); 266 } 267 ASSERT(source.isEmpty()); 268 notEnoughCharacters = true; 269 unconsumeCharacters(source, consumedCharacters); 270 return false; 271 } 272 273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) 274 { 275 if (U_IS_BMP(value)) { 276 UChar character = static_cast<UChar>(value); 277 ASSERT(character == value); 278 result[0] = character; 279 return 1; 280 } 281 282 result[0] = U16_LEAD(value); 283 result[1] = U16_TRAIL(value); 284 return 2; 285 } 286 287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4]) 288 { 289 HTMLEntitySearch search; 290 while (*name) { 291 search.advance(*name++); 292 if (!search.isEntityPrefix()) 293 return 0; 294 } 295 search.advance(';'); 296 if (!search.isEntityPrefix()) 297 return 0; 298 299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result); 300 if (!search.mostRecentMatch()->secondValue) 301 return numberOfCodePoints; 302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints); 303 } 304 305 } // namespace blink 306