1 /* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "wtf/text/TextEncodingRegistry.h" 29 30 #include "wtf/ASCIICType.h" 31 #include "wtf/CurrentTime.h" 32 #include "wtf/HashMap.h" 33 #include "wtf/HashSet.h" 34 #include "wtf/MainThread.h" 35 #include "wtf/StdLibExtras.h" 36 #include "wtf/StringExtras.h" 37 #include "wtf/ThreadingPrimitives.h" 38 #include "wtf/text/CString.h" 39 #include "wtf/text/TextCodecICU.h" 40 #include "wtf/text/TextCodecLatin1.h" 41 #include "wtf/text/TextCodecUTF16.h" 42 #include "wtf/text/TextCodecUTF8.h" 43 #include "wtf/text/TextCodecUserDefined.h" 44 #include "wtf/text/TextEncoding.h" 45 46 namespace WTF { 47 48 const size_t maxEncodingNameLength = 63; 49 50 // Hash for all-ASCII strings that does case folding. 51 struct TextEncodingNameHash { 52 static bool equal(const char* s1, const char* s2) 53 { 54 char c1; 55 char c2; 56 do { 57 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106 58 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released. 59 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only 60 c1 = toASCIILower(*s1++); 61 c2 = toASCIILower(*s2++); 62 if (c1 != c2) 63 return false; 64 #else 65 c1 = *s1++; 66 c2 = *s2++; 67 if (toASCIILower(c1) != toASCIILower(c2)) 68 return false; 69 #endif 70 } while (c1 && c2); 71 return !c1 && !c2; 72 } 73 74 // This algorithm is the one-at-a-time hash from: 75 // http://burtleburtle.net/bob/hash/hashfaq.html 76 // http://burtleburtle.net/bob/hash/doobs.html 77 static unsigned hash(const char* s) 78 { 79 unsigned h = WTF::stringHashingStartValue; 80 for (;;) { 81 char c = *s++; 82 if (!c) { 83 h += (h << 3); 84 h ^= (h >> 11); 85 h += (h << 15); 86 return h; 87 } 88 h += toASCIILower(c); 89 h += (h << 10); 90 h ^= (h >> 6); 91 } 92 } 93 94 static const bool safeToCompareToEmptyOrDeleted = false; 95 }; 96 97 struct TextCodecFactory { 98 NewTextCodecFunction function; 99 const void* additionalData; 100 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 101 }; 102 103 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 104 typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 105 106 static Mutex& encodingRegistryMutex() 107 { 108 // We don't have to use AtomicallyInitializedStatic here because 109 // this function is called on the main thread for any page before 110 // it is used in worker threads. 111 DEFINE_STATIC_LOCAL(Mutex, mutex, ()); 112 return mutex; 113 } 114 115 static TextEncodingNameMap* textEncodingNameMap; 116 static TextCodecMap* textCodecMap; 117 static bool didExtendTextCodecMaps; 118 static HashSet<const char*>* japaneseEncodings; 119 static HashSet<const char*>* nonBackslashEncodings; 120 121 static const char textEncodingNameBlacklist[][6] = { "UTF-7" }; 122 123 #if ERROR_DISABLED 124 125 static inline void checkExistingName(const char*, const char*) { } 126 127 #else 128 129 static void checkExistingName(const char* alias, const char* atomicName) 130 { 131 const char* oldAtomicName = textEncodingNameMap->get(alias); 132 if (!oldAtomicName) 133 return; 134 if (oldAtomicName == atomicName) 135 return; 136 // Keep the warning silent about one case where we know this will happen. 137 if (strcmp(alias, "ISO-8859-8-I") == 0 138 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 139 && strcasecmp(atomicName, "iso-8859-8") == 0) 140 return; 141 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 142 } 143 144 #endif 145 146 static bool isUndesiredAlias(const char* alias) 147 { 148 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 149 for (const char* p = alias; *p; ++p) { 150 if (*p == ',') 151 return true; 152 } 153 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 154 // problem, see bug 43554. 155 if (0 == strcmp(alias, "8859_1")) 156 return true; 157 return false; 158 } 159 160 static void addToTextEncodingNameMap(const char* alias, const char* name) 161 { 162 ASSERT(strlen(alias) <= maxEncodingNameLength); 163 if (isUndesiredAlias(alias)) 164 return; 165 const char* atomicName = textEncodingNameMap->get(name); 166 ASSERT(strcmp(alias, name) == 0 || atomicName); 167 if (!atomicName) 168 atomicName = name; 169 checkExistingName(alias, atomicName); 170 textEncodingNameMap->add(alias, atomicName); 171 } 172 173 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 174 { 175 const char* atomicName = textEncodingNameMap->get(name); 176 ASSERT(atomicName); 177 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 178 } 179 180 static void pruneBlacklistedCodecs() 181 { 182 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 183 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 184 if (!atomicName) 185 continue; 186 187 Vector<const char*> names; 188 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 189 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 190 for (; it != end; ++it) { 191 if (it->value == atomicName) 192 names.append(it->key); 193 } 194 195 size_t length = names.size(); 196 for (size_t j = 0; j < length; ++j) 197 textEncodingNameMap->remove(names[j]); 198 199 textCodecMap->remove(atomicName); 200 } 201 } 202 203 static void buildBaseTextCodecMaps() 204 { 205 ASSERT(isMainThread()); 206 ASSERT(!textCodecMap); 207 ASSERT(!textEncodingNameMap); 208 209 textCodecMap = new TextCodecMap; 210 textEncodingNameMap = new TextEncodingNameMap; 211 212 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 213 TextCodecLatin1::registerCodecs(addToTextCodecMap); 214 215 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 216 TextCodecUTF8::registerCodecs(addToTextCodecMap); 217 218 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 219 TextCodecUTF16::registerCodecs(addToTextCodecMap); 220 221 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 222 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 223 } 224 225 static void addEncodingName(HashSet<const char*>* set, const char* name) 226 { 227 // We must not use atomicCanonicalTextEncodingName() because this function is called in it. 228 const char* atomicName = textEncodingNameMap->get(name); 229 if (atomicName) 230 set->add(atomicName); 231 } 232 233 static void buildQuirksSets() 234 { 235 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() 236 // and initializing the sets for them in TextEncodingRegistry.cpp look strange. 237 238 ASSERT(!japaneseEncodings); 239 ASSERT(!nonBackslashEncodings); 240 241 japaneseEncodings = new HashSet<const char*>; 242 addEncodingName(japaneseEncodings, "EUC-JP"); 243 addEncodingName(japaneseEncodings, "ISO-2022-JP"); 244 addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); 245 addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); 246 addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); 247 addEncodingName(japaneseEncodings, "JIS_C6226-1978"); 248 addEncodingName(japaneseEncodings, "JIS_X0201"); 249 addEncodingName(japaneseEncodings, "JIS_X0208-1983"); 250 addEncodingName(japaneseEncodings, "JIS_X0208-1990"); 251 addEncodingName(japaneseEncodings, "JIS_X0212-1990"); 252 addEncodingName(japaneseEncodings, "Shift_JIS"); 253 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); 254 addEncodingName(japaneseEncodings, "cp932"); 255 addEncodingName(japaneseEncodings, "x-mac-japanese"); 256 257 nonBackslashEncodings = new HashSet<const char*>; 258 // The text encodings below treat backslash as a currency symbol for IE compatibility. 259 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 260 addEncodingName(nonBackslashEncodings, "x-mac-japanese"); 261 addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); 262 addEncodingName(nonBackslashEncodings, "EUC-JP"); 263 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. 264 addEncodingName(nonBackslashEncodings, "Shift_JIS"); 265 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); 266 } 267 268 bool isJapaneseEncoding(const char* canonicalEncodingName) 269 { 270 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); 271 } 272 273 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) 274 { 275 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); 276 } 277 278 static void extendTextCodecMaps() 279 { 280 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 281 TextCodecICU::registerCodecs(addToTextCodecMap); 282 283 pruneBlacklistedCodecs(); 284 buildQuirksSets(); 285 } 286 287 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 288 { 289 MutexLocker lock(encodingRegistryMutex()); 290 291 ASSERT(textCodecMap); 292 TextCodecFactory factory = textCodecMap->get(encoding.name()); 293 ASSERT(factory.function); 294 return factory.function(encoding, factory.additionalData); 295 } 296 297 const char* atomicCanonicalTextEncodingName(const char* name) 298 { 299 if (!name || !name[0]) 300 return 0; 301 if (!textEncodingNameMap) 302 buildBaseTextCodecMaps(); 303 304 MutexLocker lock(encodingRegistryMutex()); 305 306 if (const char* atomicName = textEncodingNameMap->get(name)) 307 return atomicName; 308 if (didExtendTextCodecMaps) 309 return 0; 310 extendTextCodecMaps(); 311 didExtendTextCodecMaps = true; 312 return textEncodingNameMap->get(name); 313 } 314 315 template <typename CharacterType> 316 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) 317 { 318 char buffer[maxEncodingNameLength + 1]; 319 size_t j = 0; 320 for (size_t i = 0; i < length; ++i) { 321 CharacterType c = characters[i]; 322 if (j == maxEncodingNameLength) 323 return 0; 324 buffer[j++] = c; 325 } 326 buffer[j] = 0; 327 return atomicCanonicalTextEncodingName(buffer); 328 } 329 330 const char* atomicCanonicalTextEncodingName(const String& alias) 331 { 332 if (!alias.length()) 333 return 0; 334 335 if (alias.is8Bit()) 336 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length()); 337 338 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length()); 339 } 340 341 bool noExtendedTextEncodingNameUsed() 342 { 343 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 344 return !didExtendTextCodecMaps; 345 } 346 347 #ifndef NDEBUG 348 void dumpTextEncodingNameMap() 349 { 350 unsigned size = textEncodingNameMap->size(); 351 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size); 352 353 MutexLocker lock(encodingRegistryMutex()); 354 355 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 356 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 357 for (; it != end; ++it) 358 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value); 359 } 360 #endif 361 362 } // namespace WTF 363