1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextCodecICU.h" 29 30 #include "CharacterNames.h" 31 #include "CString.h" 32 #include "PlatformString.h" 33 #include "ThreadGlobalData.h" 34 #include <unicode/ucnv.h> 35 #include <unicode/ucnv_cb.h> 36 #include <wtf/Assertions.h> 37 #include <wtf/PassOwnPtr.h> 38 #include <wtf/StringExtras.h> 39 #include <wtf/Threading.h> 40 41 using std::min; 42 43 namespace WebCore { 44 45 const size_t ConversionBufferSize = 16384; 46 47 ICUConverterWrapper::~ICUConverterWrapper() 48 { 49 if (converter) 50 ucnv_close(converter); 51 } 52 53 static UConverter*& cachedConverterICU() 54 { 55 return threadGlobalData().cachedConverterICU().converter; 56 } 57 58 static PassOwnPtr<TextCodec> newTextCodecICU(const TextEncoding& encoding, const void*) 59 { 60 return new TextCodecICU(encoding); 61 } 62 63 void TextCodecICU::registerBaseEncodingNames(EncodingNameRegistrar registrar) 64 { 65 registrar("UTF-8", "UTF-8"); 66 } 67 68 void TextCodecICU::registerBaseCodecs(TextCodecRegistrar registrar) 69 { 70 registrar("UTF-8", newTextCodecICU, 0); 71 } 72 73 // FIXME: Registering all the encodings we get from ucnv_getAvailableName 74 // includes encodings we don't want or need. For example, all 75 // the encodings with commas and version numbers. 76 77 void TextCodecICU::registerExtendedEncodingNames(EncodingNameRegistrar registrar) 78 { 79 // We register Hebrew with logical ordering using a separate name. 80 // Otherwise, this would share the same canonical name as the 81 // visual ordering case, and then TextEncoding could not tell them 82 // apart; ICU treats these names as synonyms. 83 registrar("ISO-8859-8-I", "ISO-8859-8-I"); 84 85 int32_t numEncodings = ucnv_countAvailable(); 86 for (int32_t i = 0; i < numEncodings; ++i) { 87 const char* name = ucnv_getAvailableName(i); 88 UErrorCode error = U_ZERO_ERROR; 89 // Try MIME before trying IANA to pick up commonly used names like 90 // 'EUC-JP' instead of horrendously long names like 91 // 'Extended_UNIX_Code_Packed_Format_for_Japanese'. 92 const char* standardName = ucnv_getStandardName(name, "MIME", &error); 93 if (!U_SUCCESS(error) || !standardName) { 94 error = U_ZERO_ERROR; 95 // Try IANA to pick up 'windows-12xx' and other names 96 // which are not preferred MIME names but are widely used. 97 standardName = ucnv_getStandardName(name, "IANA", &error); 98 if (!U_SUCCESS(error) || !standardName) 99 continue; 100 } 101 102 // 1. Treat GB2312 encoding as GBK (its more modern superset), to match other browsers. 103 // 2. On the Web, GB2312 is encoded as EUC-CN or HZ, while ICU provides a native encoding 104 // for encoding GB_2312-80 and several others. So, we need to override this behavior, too. 105 if (strcmp(standardName, "GB2312") == 0 || strcmp(standardName, "GB_2312-80") == 0) 106 standardName = "GBK"; 107 // Similarly, EUC-KR encodings all map to an extended version. 108 else if (strcmp(standardName, "KSC_5601") == 0 || strcmp(standardName, "EUC-KR") == 0 || strcmp(standardName, "cp1363") == 0) 109 standardName = "windows-949"; 110 // And so on. 111 else if (strcasecmp(standardName, "iso-8859-9") == 0) // This name is returned in different case by ICU 3.2 and 3.6. 112 standardName = "windows-1254"; 113 else if (strcmp(standardName, "TIS-620") == 0) 114 standardName = "windows-874"; 115 116 registrar(standardName, standardName); 117 118 uint16_t numAliases = ucnv_countAliases(name, &error); 119 ASSERT(U_SUCCESS(error)); 120 if (U_SUCCESS(error)) 121 for (uint16_t j = 0; j < numAliases; ++j) { 122 error = U_ZERO_ERROR; 123 const char* alias = ucnv_getAlias(name, j, &error); 124 ASSERT(U_SUCCESS(error)); 125 if (U_SUCCESS(error) && alias != standardName) 126 registrar(alias, standardName); 127 } 128 } 129 130 // Additional aliases. 131 // These are present in modern versions of ICU, but not in ICU 3.2 (shipped with Mac OS X 10.4). 132 registrar("macroman", "macintosh"); 133 registrar("maccyrillic", "x-mac-cyrillic"); 134 135 // Additional aliases that historically were present in the encoding 136 // table in WebKit on Macintosh that don't seem to be present in ICU. 137 // Perhaps we can prove these are not used on the web and remove them. 138 // Or perhaps we can get them added to ICU. 139 registrar("xmacroman", "macintosh"); 140 registrar("xmacukrainian", "x-mac-cyrillic"); 141 registrar("cnbig5", "Big5"); 142 registrar("xxbig5", "Big5"); 143 registrar("cngb", "GBK"); 144 registrar("csgb231280", "GBK"); 145 registrar("xeuccn", "GBK"); 146 registrar("xgbk", "GBK"); 147 registrar("csISO88598I", "ISO_8859-8-I"); 148 registrar("koi", "KOI8-R"); 149 registrar("logical", "ISO-8859-8-I"); 150 registrar("unicode11utf8", "UTF-8"); 151 registrar("unicode20utf8", "UTF-8"); 152 registrar("xunicode20utf8", "UTF-8"); 153 registrar("visual", "ISO-8859-8"); 154 registrar("winarabic", "windows-1256"); 155 registrar("winbaltic", "windows-1257"); 156 registrar("wincyrillic", "windows-1251"); 157 registrar("iso885911", "windows-874"); 158 registrar("dos874", "windows-874"); 159 registrar("wingreek", "windows-1253"); 160 registrar("winhebrew", "windows-1255"); 161 registrar("winlatin2", "windows-1250"); 162 registrar("winturkish", "windows-1254"); 163 registrar("winvietnamese", "windows-1258"); 164 registrar("xcp1250", "windows-1250"); 165 registrar("xcp1251", "windows-1251"); 166 registrar("xeuc", "EUC-JP"); 167 registrar("xwindows949", "windows-949"); 168 registrar("xuhc", "windows-949"); 169 170 // These aliases are present in modern versions of ICU, but use different codecs, and have no standard names. 171 // They are not present in ICU 3.2. 172 registrar("dos720", "cp864"); 173 registrar("jis7", "ISO-2022-JP"); 174 } 175 176 void TextCodecICU::registerExtendedCodecs(TextCodecRegistrar registrar) 177 { 178 // See comment above in registerEncodingNames. 179 registrar("ISO-8859-8-I", newTextCodecICU, 0); 180 181 int32_t numEncodings = ucnv_countAvailable(); 182 for (int32_t i = 0; i < numEncodings; ++i) { 183 const char* name = ucnv_getAvailableName(i); 184 UErrorCode error = U_ZERO_ERROR; 185 const char* standardName = ucnv_getStandardName(name, "MIME", &error); 186 if (!U_SUCCESS(error) || !standardName) { 187 error = U_ZERO_ERROR; 188 standardName = ucnv_getStandardName(name, "IANA", &error); 189 if (!U_SUCCESS(error) || !standardName) 190 continue; 191 } 192 registrar(standardName, newTextCodecICU, 0); 193 } 194 } 195 196 TextCodecICU::TextCodecICU(const TextEncoding& encoding) 197 : m_encoding(encoding) 198 , m_numBufferedBytes(0) 199 , m_converterICU(0) 200 , m_needsGBKFallbacks(false) 201 { 202 } 203 204 TextCodecICU::~TextCodecICU() 205 { 206 releaseICUConverter(); 207 } 208 209 void TextCodecICU::releaseICUConverter() const 210 { 211 if (m_converterICU) { 212 UConverter*& cachedConverter = cachedConverterICU(); 213 if (cachedConverter) 214 ucnv_close(cachedConverter); 215 cachedConverter = m_converterICU; 216 m_converterICU = 0; 217 } 218 } 219 220 void TextCodecICU::createICUConverter() const 221 { 222 ASSERT(!m_converterICU); 223 224 const char* name = m_encoding.name(); 225 m_needsGBKFallbacks = name[0] == 'G' && name[1] == 'B' && name[2] == 'K' && !name[3]; 226 227 UErrorCode err; 228 229 UConverter*& cachedConverter = cachedConverterICU(); 230 if (cachedConverter) { 231 err = U_ZERO_ERROR; 232 const char* cachedName = ucnv_getName(cachedConverter, &err); 233 if (U_SUCCESS(err) && m_encoding == cachedName) { 234 m_converterICU = cachedConverter; 235 cachedConverter = 0; 236 return; 237 } 238 } 239 240 err = U_ZERO_ERROR; 241 m_converterICU = ucnv_open(m_encoding.name(), &err); 242 #if !LOG_DISABLED 243 if (err == U_AMBIGUOUS_ALIAS_WARNING) 244 LOG_ERROR("ICU ambiguous alias warning for encoding: %s", m_encoding.name()); 245 #endif 246 if (m_converterICU) 247 ucnv_setFallback(m_converterICU, TRUE); 248 } 249 250 int TextCodecICU::decodeToBuffer(UChar* target, UChar* targetLimit, const char*& source, const char* sourceLimit, int32_t* offsets, bool flush, UErrorCode& err) 251 { 252 UChar* targetStart = target; 253 err = U_ZERO_ERROR; 254 ucnv_toUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, offsets, flush, &err); 255 return target - targetStart; 256 } 257 258 class ErrorCallbackSetter { 259 public: 260 ErrorCallbackSetter(UConverter* converter, bool stopOnError) 261 : m_converter(converter) 262 , m_shouldStopOnEncodingErrors(stopOnError) 263 { 264 if (m_shouldStopOnEncodingErrors) { 265 UErrorCode err = U_ZERO_ERROR; 266 ucnv_setToUCallBack(m_converter, UCNV_TO_U_CALLBACK_SUBSTITUTE, 267 UCNV_SUB_STOP_ON_ILLEGAL, &m_savedAction, 268 &m_savedContext, &err); 269 ASSERT(err == U_ZERO_ERROR); 270 } 271 } 272 ~ErrorCallbackSetter() 273 { 274 if (m_shouldStopOnEncodingErrors) { 275 UErrorCode err = U_ZERO_ERROR; 276 const void* oldContext; 277 UConverterToUCallback oldAction; 278 ucnv_setToUCallBack(m_converter, m_savedAction, 279 m_savedContext, &oldAction, 280 &oldContext, &err); 281 ASSERT(oldAction == UCNV_TO_U_CALLBACK_SUBSTITUTE); 282 ASSERT(!strcmp(static_cast<const char*>(oldContext), UCNV_SUB_STOP_ON_ILLEGAL)); 283 ASSERT(err == U_ZERO_ERROR); 284 } 285 } 286 private: 287 UConverter* m_converter; 288 bool m_shouldStopOnEncodingErrors; 289 const void* m_savedContext; 290 UConverterToUCallback m_savedAction; 291 }; 292 293 String TextCodecICU::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 294 { 295 // Get a converter for the passed-in encoding. 296 if (!m_converterICU) { 297 createICUConverter(); 298 ASSERT(m_converterICU); 299 if (!m_converterICU) { 300 LOG_ERROR("error creating ICU encoder even though encoding was in table"); 301 return String(); 302 } 303 } 304 305 ErrorCallbackSetter callbackSetter(m_converterICU, stopOnError); 306 307 Vector<UChar> result; 308 309 UChar buffer[ConversionBufferSize]; 310 UChar* bufferLimit = buffer + ConversionBufferSize; 311 const char* source = reinterpret_cast<const char*>(bytes); 312 const char* sourceLimit = source + length; 313 int32_t* offsets = NULL; 314 UErrorCode err = U_ZERO_ERROR; 315 316 do { 317 int ucharsDecoded = decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, flush, err); 318 result.append(buffer, ucharsDecoded); 319 } while (err == U_BUFFER_OVERFLOW_ERROR); 320 321 if (U_FAILURE(err)) { 322 // flush the converter so it can be reused, and not be bothered by this error. 323 do { 324 decodeToBuffer(buffer, bufferLimit, source, sourceLimit, offsets, true, err); 325 } while (source < sourceLimit); 326 sawError = true; 327 } 328 329 String resultString = String::adopt(result); 330 331 // <http://bugs.webkit.org/show_bug.cgi?id=17014> 332 // Simplified Chinese pages use the code A3A0 to mean "full-width space", but ICU decodes it as U+E5E5. 333 if (strcmp(m_encoding.name(), "GBK") == 0 || strcasecmp(m_encoding.name(), "gb18030") == 0) 334 resultString.replace(0xE5E5, ideographicSpace); 335 336 return resultString; 337 } 338 339 // We need to apply these fallbacks ourselves as they are not currently supported by ICU and 340 // they were provided by the old TEC encoding path 341 // Needed to fix <rdar://problem/4708689> 342 static UChar getGbkEscape(UChar32 codePoint) 343 { 344 switch (codePoint) { 345 case 0x01F9: 346 return 0xE7C8; 347 case 0x1E3F: 348 return 0xE7C7; 349 case 0x22EF: 350 return 0x2026; 351 case 0x301C: 352 return 0xFF5E; 353 default: 354 return 0; 355 } 356 } 357 358 // Invalid character handler when writing escaped entities for unrepresentable 359 // characters. See the declaration of TextCodec::encode for more. 360 static void urlEscapedEntityCallback(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, 361 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 362 { 363 if (reason == UCNV_UNASSIGNED) { 364 *err = U_ZERO_ERROR; 365 366 UnencodableReplacementArray entity; 367 int entityLen = TextCodec::getUnencodableReplacement(codePoint, URLEncodedEntitiesForUnencodables, entity); 368 ucnv_cbFromUWriteBytes(fromUArgs, entity, entityLen, 0, err); 369 } else 370 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); 371 } 372 373 // Substitutes special GBK characters, escaping all other unassigned entities. 374 static void gbkCallbackEscape(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, 375 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 376 { 377 UChar outChar; 378 if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) { 379 const UChar* source = &outChar; 380 *err = U_ZERO_ERROR; 381 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); 382 return; 383 } 384 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); 385 } 386 387 // Combines both gbkUrlEscapedEntityCallback and GBK character substitution. 388 static void gbkUrlEscapedEntityCallack(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, 389 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 390 { 391 if (reason == UCNV_UNASSIGNED) { 392 if (UChar outChar = getGbkEscape(codePoint)) { 393 const UChar* source = &outChar; 394 *err = U_ZERO_ERROR; 395 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); 396 return; 397 } 398 urlEscapedEntityCallback(context, fromUArgs, codeUnits, length, codePoint, reason, err); 399 return; 400 } 401 UCNV_FROM_U_CALLBACK_ESCAPE(context, fromUArgs, codeUnits, length, codePoint, reason, err); 402 } 403 404 static void gbkCallbackSubstitute(const void* context, UConverterFromUnicodeArgs* fromUArgs, const UChar* codeUnits, int32_t length, 405 UChar32 codePoint, UConverterCallbackReason reason, UErrorCode* err) 406 { 407 UChar outChar; 408 if (reason == UCNV_UNASSIGNED && (outChar = getGbkEscape(codePoint))) { 409 const UChar* source = &outChar; 410 *err = U_ZERO_ERROR; 411 ucnv_cbFromUWriteUChars(fromUArgs, &source, source + 1, 0, err); 412 return; 413 } 414 UCNV_FROM_U_CALLBACK_SUBSTITUTE(context, fromUArgs, codeUnits, length, codePoint, reason, err); 415 } 416 417 CString TextCodecICU::encode(const UChar* characters, size_t length, UnencodableHandling handling) 418 { 419 if (!length) 420 return ""; 421 422 if (!m_converterICU) 423 createICUConverter(); 424 if (!m_converterICU) 425 return CString(); 426 427 // FIXME: We should see if there is "force ASCII range" mode in ICU; 428 // until then, we change the backslash into a yen sign. 429 // Encoding will change the yen sign back into a backslash. 430 String copy(characters, length); 431 copy = m_encoding.displayString(copy.impl()); 432 433 const UChar* source = copy.characters(); 434 const UChar* sourceLimit = source + copy.length(); 435 436 UErrorCode err = U_ZERO_ERROR; 437 438 switch (handling) { 439 case QuestionMarksForUnencodables: 440 ucnv_setSubstChars(m_converterICU, "?", 1, &err); 441 ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackSubstitute : UCNV_FROM_U_CALLBACK_SUBSTITUTE, 0, 0, 0, &err); 442 break; 443 case EntitiesForUnencodables: 444 ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkCallbackEscape : UCNV_FROM_U_CALLBACK_ESCAPE, UCNV_ESCAPE_XML_DEC, 0, 0, &err); 445 break; 446 case URLEncodedEntitiesForUnencodables: 447 ucnv_setFromUCallBack(m_converterICU, m_needsGBKFallbacks ? gbkUrlEscapedEntityCallack : urlEscapedEntityCallback, 0, 0, 0, &err); 448 break; 449 } 450 451 ASSERT(U_SUCCESS(err)); 452 if (U_FAILURE(err)) 453 return CString(); 454 455 Vector<char> result; 456 size_t size = 0; 457 do { 458 char buffer[ConversionBufferSize]; 459 char* target = buffer; 460 char* targetLimit = target + ConversionBufferSize; 461 err = U_ZERO_ERROR; 462 ucnv_fromUnicode(m_converterICU, &target, targetLimit, &source, sourceLimit, 0, true, &err); 463 size_t count = target - buffer; 464 result.grow(size + count); 465 memcpy(result.data() + size, buffer, count); 466 size += count; 467 } while (err == U_BUFFER_OVERFLOW_ERROR); 468 469 return CString(result.data(), size); 470 } 471 472 473 } // namespace WebCore 474