1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "core/html/parser/HTMLEntityParser.h" 30 31 #include "core/html/parser/HTMLEntitySearch.h" 32 #include "core/html/parser/HTMLEntityTable.h" 33 #include "wtf/text/StringBuilder.h" 34 35 using namespace WTF; 36 37 namespace WebCore { 38 39 static const UChar windowsLatin1ExtensionArray[32] = { 40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F 44 }; 45 46 static bool isAlphaNumeric(UChar cc) 47 { 48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); 49 } 50 51 static UChar adjustEntity(UChar32 value) 52 { 53 if ((value & ~0x1F) != 0x0080) 54 return value; 55 return windowsLatin1ExtensionArray[value - 0x80]; 56 } 57 58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity) 59 { 60 // FIXME: A number of specific entity values generate parse errors. 61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { 62 decodedEntity.append(0xFFFD); 63 return; 64 } 65 if (U_IS_BMP(c)) { 66 decodedEntity.append(adjustEntity(c)); 67 return; 68 } 69 decodedEntity.append(c); 70 } 71 72 static const UChar32 kInvalidUnicode = -1; 73 74 static bool isHexDigit(UChar cc) 75 { 76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); 77 } 78 79 static UChar asHexDigit(UChar cc) 80 { 81 if (cc >= '0' && cc <= '9') 82 return cc - '0'; 83 if (cc >= 'a' && cc <= 'z') 84 return 10 + cc - 'a'; 85 if (cc >= 'A' && cc <= 'Z') 86 return 10 + cc - 'A'; 87 ASSERT_NOT_REACHED(); 88 return 0; 89 } 90 91 typedef Vector<UChar, 64> ConsumedCharacterBuffer; 92 93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters) 94 { 95 if (consumedCharacters.size() == 1) 96 source.push(consumedCharacters[0]); 97 else if (consumedCharacters.size() == 2) { 98 source.push(consumedCharacters[0]); 99 source.push(consumedCharacters[1]); 100 } else 101 source.prepend(SegmentedString(String(consumedCharacters))); 102 } 103 104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) 105 { 106 ConsumedCharacterBuffer consumedCharacters; 107 HTMLEntitySearch entitySearch; 108 while (!source.isEmpty()) { 109 cc = source.currentChar(); 110 entitySearch.advance(cc); 111 if (!entitySearch.isEntityPrefix()) 112 break; 113 consumedCharacters.append(cc); 114 source.advanceAndASSERT(cc); 115 } 116 notEnoughCharacters = source.isEmpty(); 117 if (notEnoughCharacters) { 118 // We can't an entity because there might be a longer entity 119 // that we could match if we had more data. 120 unconsumeCharacters(source, consumedCharacters); 121 return false; 122 } 123 if (!entitySearch.mostRecentMatch()) { 124 unconsumeCharacters(source, consumedCharacters); 125 return false; 126 } 127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 128 // We've consumed too many characters. We need to walk the 129 // source back to the point at which we had consumed an 130 // actual entity. 131 unconsumeCharacters(source, consumedCharacters); 132 consumedCharacters.clear(); 133 const int length = entitySearch.mostRecentMatch()->length; 134 const UChar* reference = entitySearch.mostRecentMatch()->entity; 135 for (int i = 0; i < length; ++i) { 136 cc = source.currentChar(); 137 ASSERT_UNUSED(reference, cc == *reference++); 138 consumedCharacters.append(cc); 139 source.advanceAndASSERT(cc); 140 ASSERT(!source.isEmpty()); 141 } 142 cc = source.currentChar(); 143 } 144 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 145 || !additionalAllowedCharacter 146 || !(isAlphaNumeric(cc) || cc == '=')) { 147 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); 148 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue) 149 decodedEntity.append(second); 150 return true; 151 } 152 unconsumeCharacters(source, consumedCharacters); 153 return false; 154 } 155 156 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 157 { 158 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 159 ASSERT(!notEnoughCharacters); 160 ASSERT(decodedEntity.isEmpty()); 161 162 enum EntityState { 163 Initial, 164 Number, 165 MaybeHexLowerCaseX, 166 MaybeHexUpperCaseX, 167 Hex, 168 Decimal, 169 Named 170 }; 171 EntityState entityState = Initial; 172 UChar32 result = 0; 173 ConsumedCharacterBuffer consumedCharacters; 174 175 while (!source.isEmpty()) { 176 UChar cc = source.currentChar(); 177 switch (entityState) { 178 case Initial: { 179 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') 180 return false; 181 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) 182 return false; 183 if (cc == '#') { 184 entityState = Number; 185 break; 186 } 187 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { 188 entityState = Named; 189 continue; 190 } 191 return false; 192 } 193 case Number: { 194 if (cc == 'x') { 195 entityState = MaybeHexLowerCaseX; 196 break; 197 } 198 if (cc == 'X') { 199 entityState = MaybeHexUpperCaseX; 200 break; 201 } 202 if (cc >= '0' && cc <= '9') { 203 entityState = Decimal; 204 continue; 205 } 206 source.push('#'); 207 return false; 208 } 209 case MaybeHexLowerCaseX: { 210 if (isHexDigit(cc)) { 211 entityState = Hex; 212 continue; 213 } 214 source.push('#'); 215 source.push('x'); 216 return false; 217 } 218 case MaybeHexUpperCaseX: { 219 if (isHexDigit(cc)) { 220 entityState = Hex; 221 continue; 222 } 223 source.push('#'); 224 source.push('X'); 225 return false; 226 } 227 case Hex: { 228 if (isHexDigit(cc)) { 229 if (result != kInvalidUnicode) 230 result = result * 16 + asHexDigit(cc); 231 } else if (cc == ';') { 232 source.advanceAndASSERT(cc); 233 appendLegalEntityFor(result, decodedEntity); 234 return true; 235 } else { 236 appendLegalEntityFor(result, decodedEntity); 237 return true; 238 } 239 break; 240 } 241 case Decimal: { 242 if (cc >= '0' && cc <= '9') { 243 if (result != kInvalidUnicode) 244 result = result * 10 + cc - '0'; 245 } else if (cc == ';') { 246 source.advanceAndASSERT(cc); 247 appendLegalEntityFor(result, decodedEntity); 248 return true; 249 } else { 250 appendLegalEntityFor(result, decodedEntity); 251 return true; 252 } 253 break; 254 } 255 case Named: { 256 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc); 257 } 258 } 259 260 if (result > UCHAR_MAX_VALUE) 261 result = kInvalidUnicode; 262 263 consumedCharacters.append(cc); 264 source.advanceAndASSERT(cc); 265 } 266 ASSERT(source.isEmpty()); 267 notEnoughCharacters = true; 268 unconsumeCharacters(source, consumedCharacters); 269 return false; 270 } 271 272 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) 273 { 274 if (U_IS_BMP(value)) { 275 UChar character = static_cast<UChar>(value); 276 ASSERT(character == value); 277 result[0] = character; 278 return 1; 279 } 280 281 result[0] = U16_LEAD(value); 282 result[1] = U16_TRAIL(value); 283 return 2; 284 } 285 286 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4]) 287 { 288 HTMLEntitySearch search; 289 while (*name) { 290 search.advance(*name++); 291 if (!search.isEntityPrefix()) 292 return 0; 293 } 294 search.advance(';'); 295 if (!search.isEntityPrefix()) 296 return 0; 297 298 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result); 299 if (!search.mostRecentMatch()->secondValue) 300 return numberOfCodePoints; 301 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints); 302 } 303 304 } // namespace WebCore 305