1 /* 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "HTMLEntityParser.h" 30 31 #include "HTMLEntitySearch.h" 32 #include "HTMLEntityTable.h" 33 #include <wtf/Vector.h> 34 35 using namespace WTF; 36 37 namespace WebCore { 38 39 namespace { 40 41 static const UChar windowsLatin1ExtensionArray[32] = { 42 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 43 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 44 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 45 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F 46 }; 47 48 inline UChar adjustEntity(UChar32 value) 49 { 50 if ((value & ~0x1F) != 0x0080) 51 return value; 52 return windowsLatin1ExtensionArray[value - 0x80]; 53 } 54 55 inline UChar32 legalEntityFor(UChar32 value) 56 { 57 // FIXME: A number of specific entity values generate parse errors. 58 if (value == 0 || value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) 59 return 0xFFFD; 60 if (U_IS_BMP(value)) 61 return adjustEntity(value); 62 return value; 63 } 64 65 inline bool convertToUTF16(UChar32 value, Vector<UChar, 16>& decodedEntity) 66 { 67 if (U_IS_BMP(value)) { 68 UChar character = static_cast<UChar>(value); 69 ASSERT(character == value); 70 decodedEntity.append(character); 71 return true; 72 } 73 decodedEntity.append(U16_LEAD(value)); 74 decodedEntity.append(U16_TRAIL(value)); 75 return true; 76 } 77 78 inline bool isHexDigit(UChar cc) 79 { 80 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); 81 } 82 83 inline bool isAlphaNumeric(UChar cc) 84 { 85 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); 86 } 87 88 void unconsumeCharacters(SegmentedString& source, const Vector<UChar, 10>& consumedCharacters) 89 { 90 if (consumedCharacters.size() == 1) 91 source.push(consumedCharacters[0]); 92 else if (consumedCharacters.size() == 2) { 93 source.push(consumedCharacters[0]); 94 source.push(consumedCharacters[1]); 95 } else 96 source.prepend(SegmentedString(String(consumedCharacters.data(), consumedCharacters.size()))); 97 } 98 99 } 100 101 bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 102 { 103 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 104 ASSERT(!notEnoughCharacters); 105 ASSERT(decodedEntity.isEmpty()); 106 107 enum EntityState { 108 Initial, 109 Number, 110 MaybeHexLowerCaseX, 111 MaybeHexUpperCaseX, 112 Hex, 113 Decimal, 114 Named 115 }; 116 EntityState entityState = Initial; 117 UChar32 result = 0; 118 Vector<UChar, 10> consumedCharacters; 119 120 while (!source.isEmpty()) { 121 UChar cc = *source; 122 switch (entityState) { 123 case Initial: { 124 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') 125 return false; 126 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) 127 return false; 128 if (cc == '#') { 129 entityState = Number; 130 break; 131 } 132 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { 133 entityState = Named; 134 continue; 135 } 136 return false; 137 } 138 case Number: { 139 if (cc == 'x') { 140 entityState = MaybeHexLowerCaseX; 141 break; 142 } 143 if (cc == 'X') { 144 entityState = MaybeHexUpperCaseX; 145 break; 146 } 147 if (cc >= '0' && cc <= '9') { 148 entityState = Decimal; 149 continue; 150 } 151 source.push('#'); 152 return false; 153 } 154 case MaybeHexLowerCaseX: { 155 if (isHexDigit(cc)) { 156 entityState = Hex; 157 continue; 158 } 159 source.push('#'); 160 source.push('x'); 161 return false; 162 } 163 case MaybeHexUpperCaseX: { 164 if (isHexDigit(cc)) { 165 entityState = Hex; 166 continue; 167 } 168 source.push('#'); 169 source.push('X'); 170 return false; 171 } 172 case Hex: { 173 if (cc >= '0' && cc <= '9') 174 result = result * 16 + cc - '0'; 175 else if (cc >= 'a' && cc <= 'f') 176 result = result * 16 + 10 + cc - 'a'; 177 else if (cc >= 'A' && cc <= 'F') 178 result = result * 16 + 10 + cc - 'A'; 179 else { 180 if (cc == ';') 181 source.advanceAndASSERT(cc); 182 return convertToUTF16(legalEntityFor(result), decodedEntity); 183 } 184 break; 185 } 186 case Decimal: { 187 if (cc >= '0' && cc <= '9') 188 result = result * 10 + cc - '0'; 189 else { 190 if (cc == ';') 191 source.advanceAndASSERT(cc); 192 return convertToUTF16(legalEntityFor(result), decodedEntity); 193 } 194 break; 195 } 196 case Named: { 197 HTMLEntitySearch entitySearch; 198 while (!source.isEmpty()) { 199 cc = *source; 200 entitySearch.advance(cc); 201 if (!entitySearch.isEntityPrefix()) 202 break; 203 consumedCharacters.append(cc); 204 source.advanceAndASSERT(cc); 205 } 206 notEnoughCharacters = source.isEmpty(); 207 if (notEnoughCharacters) { 208 // We can't an entity because there might be a longer entity 209 // that we could match if we had more data. 210 unconsumeCharacters(source, consumedCharacters); 211 return false; 212 } 213 if (!entitySearch.mostRecentMatch()) { 214 ASSERT(!entitySearch.currentValue()); 215 unconsumeCharacters(source, consumedCharacters); 216 return false; 217 } 218 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { 219 // We've consumed too many characters. We need to walk the 220 // source back to the point at which we had consumed an 221 // actual entity. 222 unconsumeCharacters(source, consumedCharacters); 223 consumedCharacters.clear(); 224 const int length = entitySearch.mostRecentMatch()->length; 225 const UChar* reference = entitySearch.mostRecentMatch()->entity; 226 for (int i = 0; i < length; ++i) { 227 cc = *source; 228 ASSERT_UNUSED(reference, cc == *reference++); 229 consumedCharacters.append(cc); 230 source.advanceAndASSERT(cc); 231 ASSERT(!source.isEmpty()); 232 } 233 cc = *source; 234 } 235 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' 236 || !additionalAllowedCharacter 237 || !(isAlphaNumeric(cc) || cc == '=')) { 238 return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); 239 } 240 unconsumeCharacters(source, consumedCharacters); 241 return false; 242 } 243 } 244 consumedCharacters.append(cc); 245 source.advanceAndASSERT(cc); 246 } 247 ASSERT(source.isEmpty()); 248 notEnoughCharacters = true; 249 unconsumeCharacters(source, consumedCharacters); 250 return false; 251 } 252 253 UChar decodeNamedEntity(const char* name) 254 { 255 HTMLEntitySearch search; 256 while (*name) { 257 search.advance(*name++); 258 if (!search.isEntityPrefix()) 259 return 0; 260 } 261 search.advance(';'); 262 UChar32 entityValue = search.currentValue(); 263 if (U16_LENGTH(entityValue) != 1) { 264 // Callers need to move off this API if the entity table has values 265 // which do no fit in a 16 bit UChar! 266 ASSERT_NOT_REACHED(); 267 return 0; 268 } 269 return static_cast<UChar>(entityValue); 270 } 271 272 } // namespace WebCore 273