1 /* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "wtf/text/TextEncodingRegistry.h" 29 30 #include "wtf/ASCIICType.h" 31 #include "wtf/CurrentTime.h" 32 #include "wtf/HashMap.h" 33 #include "wtf/HashSet.h" 34 #include "wtf/MainThread.h" 35 #include "wtf/StdLibExtras.h" 36 #include "wtf/StringExtras.h" 37 #include "wtf/text/CString.h" 38 #include "wtf/text/TextCodecICU.h" 39 #include "wtf/text/TextCodecLatin1.h" 40 #include "wtf/text/TextCodecUTF16.h" 41 #include "wtf/text/TextCodecUTF8.h" 42 #include "wtf/text/TextCodecUserDefined.h" 43 #include "wtf/text/TextEncoding.h" 44 45 namespace WTF { 46 47 const size_t maxEncodingNameLength = 63; 48 49 // Hash for all-ASCII strings that does case folding. 50 struct TextEncodingNameHash { 51 static bool equal(const char* s1, const char* s2) 52 { 53 char c1; 54 char c2; 55 do { 56 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106 57 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released. 58 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only 59 c1 = toASCIILower(*s1++); 60 c2 = toASCIILower(*s2++); 61 if (c1 != c2) 62 return false; 63 #else 64 c1 = *s1++; 65 c2 = *s2++; 66 if (toASCIILower(c1) != toASCIILower(c2)) 67 return false; 68 #endif 69 } while (c1 && c2); 70 return !c1 && !c2; 71 } 72 73 // This algorithm is the one-at-a-time hash from: 74 // http://burtleburtle.net/bob/hash/hashfaq.html 75 // http://burtleburtle.net/bob/hash/doobs.html 76 static unsigned hash(const char* s) 77 { 78 unsigned h = WTF::stringHashingStartValue; 79 for (;;) { 80 char c = *s++; 81 if (!c) { 82 h += (h << 3); 83 h ^= (h >> 11); 84 h += (h << 15); 85 return h; 86 } 87 h += toASCIILower(c); 88 h += (h << 10); 89 h ^= (h >> 6); 90 } 91 } 92 93 static const bool safeToCompareToEmptyOrDeleted = false; 94 }; 95 96 struct TextCodecFactory { 97 NewTextCodecFunction function; 98 const void* additionalData; 99 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 100 }; 101 102 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 103 typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 104 105 static Mutex& encodingRegistryMutex() 106 { 107 // We don't have to use AtomicallyInitializedStatic here because 108 // this function is called on the main thread for any page before 109 // it is used in worker threads. 110 DEFINE_STATIC_LOCAL(Mutex, mutex, ()); 111 return mutex; 112 } 113 114 static TextEncodingNameMap* textEncodingNameMap; 115 static TextCodecMap* textCodecMap; 116 static bool didExtendTextCodecMaps; 117 static HashSet<const char*>* japaneseEncodings; 118 static HashSet<const char*>* nonBackslashEncodings; 119 120 static const char* const textEncodingNameBlacklist[] = { "UTF-7" }; 121 122 #if ERROR_DISABLED 123 124 static inline void checkExistingName(const char*, const char*) { } 125 126 #else 127 128 static void checkExistingName(const char* alias, const char* atomicName) 129 { 130 const char* oldAtomicName = textEncodingNameMap->get(alias); 131 if (!oldAtomicName) 132 return; 133 if (oldAtomicName == atomicName) 134 return; 135 // Keep the warning silent about one case where we know this will happen. 136 if (strcmp(alias, "ISO-8859-8-I") == 0 137 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 138 && strcasecmp(atomicName, "iso-8859-8") == 0) 139 return; 140 LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 141 } 142 143 #endif 144 145 static bool isUndesiredAlias(const char* alias) 146 { 147 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 148 for (const char* p = alias; *p; ++p) { 149 if (*p == ',') 150 return true; 151 } 152 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 153 // problem, see bug 43554. 154 if (0 == strcmp(alias, "8859_1")) 155 return true; 156 return false; 157 } 158 159 static void addToTextEncodingNameMap(const char* alias, const char* name) 160 { 161 ASSERT(strlen(alias) <= maxEncodingNameLength); 162 if (isUndesiredAlias(alias)) 163 return; 164 const char* atomicName = textEncodingNameMap->get(name); 165 ASSERT(strcmp(alias, name) == 0 || atomicName); 166 if (!atomicName) 167 atomicName = name; 168 checkExistingName(alias, atomicName); 169 textEncodingNameMap->add(alias, atomicName); 170 } 171 172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 173 { 174 const char* atomicName = textEncodingNameMap->get(name); 175 ASSERT(atomicName); 176 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 177 } 178 179 static void pruneBlacklistedCodecs() 180 { 181 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 182 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 183 if (!atomicName) 184 continue; 185 186 Vector<const char*> names; 187 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 188 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 189 for (; it != end; ++it) { 190 if (it->value == atomicName) 191 names.append(it->key); 192 } 193 194 size_t length = names.size(); 195 for (size_t j = 0; j < length; ++j) 196 textEncodingNameMap->remove(names[j]); 197 198 textCodecMap->remove(atomicName); 199 } 200 } 201 202 static void buildBaseTextCodecMaps() 203 { 204 ASSERT(isMainThread()); 205 ASSERT(!textCodecMap); 206 ASSERT(!textEncodingNameMap); 207 208 textCodecMap = new TextCodecMap; 209 textEncodingNameMap = new TextEncodingNameMap; 210 211 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 212 TextCodecLatin1::registerCodecs(addToTextCodecMap); 213 214 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 215 TextCodecUTF8::registerCodecs(addToTextCodecMap); 216 217 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 218 TextCodecUTF16::registerCodecs(addToTextCodecMap); 219 220 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 221 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 222 } 223 224 static void addEncodingName(HashSet<const char*>* set, const char* name) 225 { 226 // We must not use atomicCanonicalTextEncodingName() because this function is called in it. 227 const char* atomicName = textEncodingNameMap->get(name); 228 if (atomicName) 229 set->add(atomicName); 230 } 231 232 static void buildQuirksSets() 233 { 234 // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn() 235 // and initializing the sets for them in TextEncodingRegistry.cpp look strange. 236 237 ASSERT(!japaneseEncodings); 238 ASSERT(!nonBackslashEncodings); 239 240 japaneseEncodings = new HashSet<const char*>; 241 addEncodingName(japaneseEncodings, "EUC-JP"); 242 addEncodingName(japaneseEncodings, "ISO-2022-JP"); 243 addEncodingName(japaneseEncodings, "ISO-2022-JP-1"); 244 addEncodingName(japaneseEncodings, "ISO-2022-JP-2"); 245 addEncodingName(japaneseEncodings, "ISO-2022-JP-3"); 246 addEncodingName(japaneseEncodings, "JIS_C6226-1978"); 247 addEncodingName(japaneseEncodings, "JIS_X0201"); 248 addEncodingName(japaneseEncodings, "JIS_X0208-1983"); 249 addEncodingName(japaneseEncodings, "JIS_X0208-1990"); 250 addEncodingName(japaneseEncodings, "JIS_X0212-1990"); 251 addEncodingName(japaneseEncodings, "Shift_JIS"); 252 addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000"); 253 addEncodingName(japaneseEncodings, "cp932"); 254 addEncodingName(japaneseEncodings, "x-mac-japanese"); 255 256 nonBackslashEncodings = new HashSet<const char*>; 257 // The text encodings below treat backslash as a currency symbol for IE compatibility. 258 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 259 addEncodingName(nonBackslashEncodings, "x-mac-japanese"); 260 addEncodingName(nonBackslashEncodings, "ISO-2022-JP"); 261 addEncodingName(nonBackslashEncodings, "EUC-JP"); 262 // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them. 263 addEncodingName(nonBackslashEncodings, "Shift_JIS"); 264 addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000"); 265 } 266 267 bool isJapaneseEncoding(const char* canonicalEncodingName) 268 { 269 return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName); 270 } 271 272 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName) 273 { 274 return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName); 275 } 276 277 static void extendTextCodecMaps() 278 { 279 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 280 TextCodecICU::registerCodecs(addToTextCodecMap); 281 282 pruneBlacklistedCodecs(); 283 buildQuirksSets(); 284 } 285 286 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 287 { 288 MutexLocker lock(encodingRegistryMutex()); 289 290 ASSERT(textCodecMap); 291 TextCodecFactory factory = textCodecMap->get(encoding.name()); 292 ASSERT(factory.function); 293 return factory.function(encoding, factory.additionalData); 294 } 295 296 const char* atomicCanonicalTextEncodingName(const char* name) 297 { 298 if (!name || !name[0]) 299 return 0; 300 if (!textEncodingNameMap) 301 buildBaseTextCodecMaps(); 302 303 MutexLocker lock(encodingRegistryMutex()); 304 305 if (const char* atomicName = textEncodingNameMap->get(name)) 306 return atomicName; 307 if (didExtendTextCodecMaps) 308 return 0; 309 extendTextCodecMaps(); 310 didExtendTextCodecMaps = true; 311 return textEncodingNameMap->get(name); 312 } 313 314 template <typename CharacterType> 315 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) 316 { 317 char buffer[maxEncodingNameLength + 1]; 318 size_t j = 0; 319 for (size_t i = 0; i < length; ++i) { 320 CharacterType c = characters[i]; 321 if (j == maxEncodingNameLength) 322 return 0; 323 buffer[j++] = c; 324 } 325 buffer[j] = 0; 326 return atomicCanonicalTextEncodingName(buffer); 327 } 328 329 const char* atomicCanonicalTextEncodingName(const String& alias) 330 { 331 if (!alias.length()) 332 return 0; 333 334 if (alias.is8Bit()) 335 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length()); 336 337 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length()); 338 } 339 340 bool noExtendedTextEncodingNameUsed() 341 { 342 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 343 return !didExtendTextCodecMaps; 344 } 345 346 #ifndef NDEBUG 347 void dumpTextEncodingNameMap() 348 { 349 unsigned size = textEncodingNameMap->size(); 350 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size); 351 352 MutexLocker lock(encodingRegistryMutex()); 353 354 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 355 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 356 for (; it != end; ++it) 357 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value); 358 } 359 #endif 360 361 } // namespace WTF 362