1 /* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextEncodingRegistry.h" 29 30 #include "TextCodecLatin1.h" 31 #include "TextCodecUserDefined.h" 32 #include "TextCodecUTF16.h" 33 #include "TextCodecUTF8.h" 34 #include "TextEncoding.h" 35 #include <wtf/ASCIICType.h> 36 #include <wtf/HashMap.h> 37 #include <wtf/HashSet.h> 38 #include <wtf/StdLibExtras.h> 39 #include <wtf/StringExtras.h> 40 #include <wtf/Threading.h> 41 42 #if USE(ICU_UNICODE) 43 #include "TextCodecICU.h" 44 #endif 45 #if PLATFORM(MAC) 46 #include "TextCodecMac.h" 47 #endif 48 #if PLATFORM(QT) 49 #include "qt/TextCodecQt.h" 50 #endif 51 #if USE(GLIB_UNICODE) 52 #include "gtk/TextCodecGtk.h" 53 #endif 54 #if USE(BREWMP_UNICODE) 55 #include "brew/TextCodecBrew.h" 56 #endif 57 #if OS(WINCE) && !PLATFORM(QT) 58 #include "TextCodecWinCE.h" 59 #endif 60 61 #include <wtf/CurrentTime.h> 62 #include <wtf/text/CString.h> 63 64 using namespace WTF; 65 66 namespace WebCore { 67 68 const size_t maxEncodingNameLength = 63; 69 70 // Hash for all-ASCII strings that does case folding. 71 struct TextEncodingNameHash { 72 static bool equal(const char* s1, const char* s2) 73 { 74 char c1; 75 char c2; 76 do { 77 c1 = *s1++; 78 c2 = *s2++; 79 if (toASCIILower(c1) != toASCIILower(c2)) 80 return false; 81 } while (c1 && c2); 82 return !c1 && !c2; 83 } 84 85 // This algorithm is the one-at-a-time hash from: 86 // http://burtleburtle.net/bob/hash/hashfaq.html 87 // http://burtleburtle.net/bob/hash/doobs.html 88 static unsigned hash(const char* s) 89 { 90 unsigned h = WTF::stringHashingStartValue; 91 for (;;) { 92 char c = *s++; 93 if (!c) { 94 h += (h << 3); 95 h ^= (h >> 11); 96 h += (h << 15); 97 return h; 98 } 99 h += toASCIILower(c); 100 h += (h << 10); 101 h ^= (h >> 6); 102 } 103 } 104 105 static const bool safeToCompareToEmptyOrDeleted = false; 106 }; 107 108 struct TextCodecFactory { 109 NewTextCodecFunction function; 110 const void* additionalData; 111 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 112 }; 113 114 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 115 typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 116 117 static Mutex& encodingRegistryMutex() 118 { 119 // We don't have to use AtomicallyInitializedStatic here because 120 // this function is called on the main thread for any page before 121 // it is used in worker threads. 122 DEFINE_STATIC_LOCAL(Mutex, mutex, ()); 123 return mutex; 124 } 125 126 static TextEncodingNameMap* textEncodingNameMap; 127 static TextCodecMap* textCodecMap; 128 static bool didExtendTextCodecMaps; 129 static HashSet<const char*>* japaneseEncodings; 130 static HashSet<const char*>* nonBackslashEncodings; 131 132 static const char* const textEncodingNameBlacklist[] = { "UTF-7" }; 133 134 #if ERROR_DISABLED 135 136 static inline void checkExistingName(const char*, const char*) { } 137 138 #else 139 140 static void checkExistingName(const char* alias, const char* atomicName) 141 { 142 const char* oldAtomicName = textEncodingNameMap->get(alias); 143 if (!oldAtomicName) 144 return; 145 if (oldAtomicName == atomicName) 146 return; 147 // Keep the warning silent about one case where we know this will happen. 148 if (strcmp(alias, "ISO-8859-8-I") == 0 149 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 150 && strcasecmp(atomicName, "iso-8859-8") == 0) 151 return; 152 LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 153 } 154 155 #endif 156 157 static bool isUndesiredAlias(const char* alias) 158 { 159 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 160 for (const char* p = alias; *p; ++p) { 161 if (*p == ',') 162 return true; 163 } 164 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 165 // problem, see bug 43554. 166 if (0 == strcmp(alias, "8859_1")) 167 return true; 168 return false; 169 } 170 171 static void addToTextEncodingNameMap(const char* alias, const char* name) 172 { 173 ASSERT(strlen(alias) <= maxEncodingNameLength); 174 if (isUndesiredAlias(alias)) 175 return; 176 const char* atomicName = textEncodingNameMap->get(name); 177 ASSERT(strcmp(alias, name) == 0 || atomicName); 178 if (!atomicName) 179 atomicName = name; 180 checkExistingName(alias, atomicName); 181 textEncodingNameMap->add(alias, atomicName); 182 } 183 184 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 185 { 186 const char* atomicName = textEncodingNameMap->get(name); 187 ASSERT(atomicName); 188 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 189 } 190 191 static void pruneBlacklistedCodecs() 192 { 193 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 194 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 195 if (!atomicName) 196 continue; 197 198 Vector<const char*> names; 199 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 200 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 201 for (; it != end; ++it) { 202 if (it->second == atomicName) 203 names.append(it->first); 204 } 205 206 size_t length = names.size(); 207 for (size_t j = 0; j < length; ++j) 208 textEncodingNameMap->remove(names[j]); 209 210 textCodecMap->remove(atomicName); 211 } 212 } 213 214 static void buildBaseTextCodecMaps() 215 { 216 ASSERT(isMainThread()); 217 ASSERT(!textCodecMap); 218 ASSERT(!textEncodingNameMap); 219 220 textCodecMap = new TextCodecMap; 221 textEncodingNameMap = new TextEncodingNameMap; 222 223 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 224 TextCodecLatin1::registerCodecs(addToTextCodecMap); 225 226 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 227 TextCodecUTF8::registerCodecs(addToTextCodecMap); 228 229 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 230 TextCodecUTF16::registerCodecs(addToTextCodecMap); 231 232 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 233 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 234 235 #if USE(GLIB_UNICODE) 236 // FIXME: This is not needed. The code above covers all the base codecs. 237 TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap); 238 TextCodecGtk::registerBaseCodecs(addToTextCodecMap); 239 #endif 240 } 241 242 static void addEncodingName(HashSet<const char*>* set, const char* name) 243 { 244 // We must not use atomicCanonicalTextEncodingName() because this function is called in it. 245 const char* atomicName = textEncodingNameMap->get(name); 246 if (atomicName) 247 set->add(atomicName); 248 } 249 250 static void buildQuirksSets() 251 { 252 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() 253 // and initializing the sets for them in TextEncodingRegistry.cpp look strange. 254 255 ASSERT(!japaneseEncodings); 256 ASSERT(!nonBackslashEncodings); 257 258 japaneseEncodings = new HashSet<const char*>; 259 addEncodingName(japaneseEncodings, "EUC-JP"); 260 addEncodingName(japaneseEncodings, "ISO-2022-JP"); 261 addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); 262 addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); 263 addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); 264 addEncodingName(japaneseEncodings, "JIS_C6226-1978"); 265 addEncodingName(japaneseEncodings, "JIS_X0201"); 266 addEncodingName(japaneseEncodings, "JIS_X0208-1983"); 267 addEncodingName(japaneseEncodings, "JIS_X0208-1990"); 268 addEncodingName(japaneseEncodings, "JIS_X0212-1990"); 269 addEncodingName(japaneseEncodings, "Shift_JIS"); 270 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); 271 addEncodingName(japaneseEncodings, "cp932"); 272 addEncodingName(japaneseEncodings, "x-mac-japanese"); 273 274 nonBackslashEncodings = new HashSet<const char*>; 275 // The text encodings below treat backslash as a currency symbol for IE compatibility. 276 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 277 addEncodingName(nonBackslashEncodings, "x-mac-japanese"); 278 addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); 279 addEncodingName(nonBackslashEncodings, "EUC-JP"); 280 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. 281 addEncodingName(nonBackslashEncodings, "Shift_JIS"); 282 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); 283 } 284 285 bool isJapaneseEncoding(const char* canonicalEncodingName) 286 { 287 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); 288 } 289 290 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) 291 { 292 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); 293 } 294 295 static void extendTextCodecMaps() 296 { 297 #if USE(ICU_UNICODE) 298 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 299 TextCodecICU::registerCodecs(addToTextCodecMap); 300 #endif 301 302 #if USE(QT4_UNICODE) 303 TextCodecQt::registerEncodingNames(addToTextEncodingNameMap); 304 TextCodecQt::registerCodecs(addToTextCodecMap); 305 #endif 306 307 #if PLATFORM(MAC) 308 TextCodecMac::registerEncodingNames(addToTextEncodingNameMap); 309 TextCodecMac::registerCodecs(addToTextCodecMap); 310 #endif 311 312 #if USE(GLIB_UNICODE) 313 TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap); 314 TextCodecGtk::registerExtendedCodecs(addToTextCodecMap); 315 #endif 316 317 #if OS(WINCE) && !PLATFORM(QT) 318 TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap); 319 TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap); 320 #endif 321 322 pruneBlacklistedCodecs(); 323 buildQuirksSets(); 324 } 325 326 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 327 { 328 MutexLocker lock(encodingRegistryMutex()); 329 330 ASSERT(textCodecMap); 331 TextCodecFactory factory = textCodecMap->get(encoding.name()); 332 ASSERT(factory.function); 333 return factory.function(encoding, factory.additionalData); 334 } 335 336 const char* atomicCanonicalTextEncodingName(const char* name) 337 { 338 if (!name || !name[0]) 339 return 0; 340 if (!textEncodingNameMap) 341 buildBaseTextCodecMaps(); 342 343 MutexLocker lock(encodingRegistryMutex()); 344 345 if (const char* atomicName = textEncodingNameMap->get(name)) 346 return atomicName; 347 if (didExtendTextCodecMaps) 348 return 0; 349 extendTextCodecMaps(); 350 didExtendTextCodecMaps = true; 351 return textEncodingNameMap->get(name); 352 } 353 354 const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length) 355 { 356 char buffer[maxEncodingNameLength + 1]; 357 size_t j = 0; 358 for (size_t i = 0; i < length; ++i) { 359 UChar c = characters[i]; 360 if (j == maxEncodingNameLength) 361 return 0; 362 buffer[j++] = c; 363 } 364 buffer[j] = 0; 365 return atomicCanonicalTextEncodingName(buffer); 366 } 367 368 bool noExtendedTextEncodingNameUsed() 369 { 370 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 371 return !didExtendTextCodecMaps; 372 } 373 374 #ifndef NDEBUG 375 void dumpTextEncodingNameMap() 376 { 377 unsigned size = textEncodingNameMap->size(); 378 fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size); 379 380 MutexLocker lock(encodingRegistryMutex()); 381 382 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 383 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 384 for (; it != end; ++it) 385 fprintf(stderr, "'%s' => '%s'\n", it->first, it->second); 386 } 387 #endif 388 389 } // namespace WebCore 390