1 /* 2 * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved. 3 * Copyright (C) 2007-2009 Torch Mobile, Inc. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "wtf/text/TextEncodingRegistry.h" 29 30 #include "wtf/ASCIICType.h" 31 #include "wtf/CurrentTime.h" 32 #include "wtf/HashMap.h" 33 #include "wtf/HashSet.h" 34 #include "wtf/MainThread.h" 35 #include "wtf/StdLibExtras.h" 36 #include "wtf/StringExtras.h" 37 #include "wtf/ThreadingPrimitives.h" 38 #include "wtf/text/CString.h" 39 #include "wtf/text/TextCodecICU.h" 40 #include "wtf/text/TextCodecLatin1.h" 41 #include "wtf/text/TextCodecReplacement.h" 42 #include "wtf/text/TextCodecUTF16.h" 43 #include "wtf/text/TextCodecUTF8.h" 44 #include "wtf/text/TextCodecUserDefined.h" 45 #include "wtf/text/TextEncoding.h" 46 47 namespace WTF { 48 49 const size_t maxEncodingNameLength = 63; 50 51 // Hash for all-ASCII strings that does case folding. 52 struct TextEncodingNameHash { 53 static bool equal(const char* s1, const char* s2) 54 { 55 char c1; 56 char c2; 57 do { 58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106 59 // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released. 60 // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only 61 c1 = toASCIILower(*s1++); 62 c2 = toASCIILower(*s2++); 63 if (c1 != c2) 64 return false; 65 #else 66 c1 = *s1++; 67 c2 = *s2++; 68 if (toASCIILower(c1) != toASCIILower(c2)) 69 return false; 70 #endif 71 } while (c1 && c2); 72 return !c1 && !c2; 73 } 74 75 // This algorithm is the one-at-a-time hash from: 76 // http://burtleburtle.net/bob/hash/hashfaq.html 77 // http://burtleburtle.net/bob/hash/doobs.html 78 static unsigned hash(const char* s) 79 { 80 unsigned h = WTF::stringHashingStartValue; 81 for (;;) { 82 char c = *s++; 83 if (!c) { 84 h += (h << 3); 85 h ^= (h >> 11); 86 h += (h << 15); 87 return h; 88 } 89 h += toASCIILower(c); 90 h += (h << 10); 91 h ^= (h >> 6); 92 } 93 } 94 95 static const bool safeToCompareToEmptyOrDeleted = false; 96 }; 97 98 struct TextCodecFactory { 99 NewTextCodecFunction function; 100 const void* additionalData; 101 TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { } 102 }; 103 104 typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap; 105 typedef HashMap<const char*, TextCodecFactory> TextCodecMap; 106 107 static Mutex& encodingRegistryMutex() 108 { 109 // We don't have to use AtomicallyInitializedStatic here because 110 // this function is called on the main thread for any page before 111 // it is used in worker threads. 112 DEFINE_STATIC_LOCAL(Mutex, mutex, ()); 113 return mutex; 114 } 115 116 static TextEncodingNameMap* textEncodingNameMap; 117 static TextCodecMap* textCodecMap; 118 static bool didExtendTextCodecMaps; 119 120 static const char textEncodingNameBlacklist[][6] = { "UTF-7" }; 121 122 #if ERROR_DISABLED 123 124 static inline void checkExistingName(const char*, const char*) { } 125 126 #else 127 128 static void checkExistingName(const char* alias, const char* atomicName) 129 { 130 const char* oldAtomicName = textEncodingNameMap->get(alias); 131 if (!oldAtomicName) 132 return; 133 if (oldAtomicName == atomicName) 134 return; 135 // Keep the warning silent about one case where we know this will happen. 136 if (strcmp(alias, "ISO-8859-8-I") == 0 137 && strcmp(oldAtomicName, "ISO-8859-8-I") == 0 138 && strcasecmp(atomicName, "iso-8859-8") == 0) 139 return; 140 WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName); 141 } 142 143 #endif 144 145 static bool isUndesiredAlias(const char* alias) 146 { 147 // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU). 148 for (const char* p = alias; *p; ++p) { 149 if (*p == ',') 150 return true; 151 } 152 // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility 153 // problem, see bug 43554. 154 if (0 == strcmp(alias, "8859_1")) 155 return true; 156 return false; 157 } 158 159 static void addToTextEncodingNameMap(const char* alias, const char* name) 160 { 161 ASSERT(strlen(alias) <= maxEncodingNameLength); 162 if (isUndesiredAlias(alias)) 163 return; 164 const char* atomicName = textEncodingNameMap->get(name); 165 ASSERT(strcmp(alias, name) == 0 || atomicName); 166 if (!atomicName) 167 atomicName = name; 168 checkExistingName(alias, atomicName); 169 textEncodingNameMap->add(alias, atomicName); 170 } 171 172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData) 173 { 174 const char* atomicName = textEncodingNameMap->get(name); 175 ASSERT(atomicName); 176 textCodecMap->add(atomicName, TextCodecFactory(function, additionalData)); 177 } 178 179 static void pruneBlacklistedCodecs() 180 { 181 for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) { 182 const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]); 183 if (!atomicName) 184 continue; 185 186 Vector<const char*> names; 187 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 188 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 189 for (; it != end; ++it) { 190 if (it->value == atomicName) 191 names.append(it->key); 192 } 193 194 textEncodingNameMap->removeAll(names); 195 196 textCodecMap->remove(atomicName); 197 } 198 } 199 200 static void buildBaseTextCodecMaps() 201 { 202 ASSERT(isMainThread()); 203 ASSERT(!textCodecMap); 204 ASSERT(!textEncodingNameMap); 205 206 textCodecMap = new TextCodecMap; 207 textEncodingNameMap = new TextEncodingNameMap; 208 209 TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap); 210 TextCodecLatin1::registerCodecs(addToTextCodecMap); 211 212 TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap); 213 TextCodecUTF8::registerCodecs(addToTextCodecMap); 214 215 TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap); 216 TextCodecUTF16::registerCodecs(addToTextCodecMap); 217 218 TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap); 219 TextCodecUserDefined::registerCodecs(addToTextCodecMap); 220 } 221 222 bool isReplacementEncoding(const char* alias) 223 { 224 return alias && !strcasecmp(alias, "replacement"); 225 } 226 227 bool isReplacementEncoding(const String& alias) 228 { 229 return alias == "replacement"; 230 } 231 232 static void extendTextCodecMaps() 233 { 234 TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap); 235 TextCodecReplacement::registerCodecs(addToTextCodecMap); 236 237 TextCodecICU::registerEncodingNames(addToTextEncodingNameMap); 238 TextCodecICU::registerCodecs(addToTextCodecMap); 239 240 pruneBlacklistedCodecs(); 241 } 242 243 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding) 244 { 245 MutexLocker lock(encodingRegistryMutex()); 246 247 ASSERT(textCodecMap); 248 TextCodecFactory factory = textCodecMap->get(encoding.name()); 249 ASSERT(factory.function); 250 return factory.function(encoding, factory.additionalData); 251 } 252 253 const char* atomicCanonicalTextEncodingName(const char* name) 254 { 255 if (!name || !name[0]) 256 return 0; 257 if (!textEncodingNameMap) 258 buildBaseTextCodecMaps(); 259 260 MutexLocker lock(encodingRegistryMutex()); 261 262 if (const char* atomicName = textEncodingNameMap->get(name)) 263 return atomicName; 264 if (didExtendTextCodecMaps) 265 return 0; 266 extendTextCodecMaps(); 267 didExtendTextCodecMaps = true; 268 return textEncodingNameMap->get(name); 269 } 270 271 template <typename CharacterType> 272 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length) 273 { 274 char buffer[maxEncodingNameLength + 1]; 275 size_t j = 0; 276 for (size_t i = 0; i < length; ++i) { 277 CharacterType c = characters[i]; 278 if (j == maxEncodingNameLength) 279 return 0; 280 buffer[j++] = c; 281 } 282 buffer[j] = 0; 283 return atomicCanonicalTextEncodingName(buffer); 284 } 285 286 const char* atomicCanonicalTextEncodingName(const String& alias) 287 { 288 if (!alias.length()) 289 return 0; 290 291 if (alias.is8Bit()) 292 return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length()); 293 294 return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length()); 295 } 296 297 bool noExtendedTextEncodingNameUsed() 298 { 299 // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value. 300 return !didExtendTextCodecMaps; 301 } 302 303 #ifndef NDEBUG 304 void dumpTextEncodingNameMap() 305 { 306 unsigned size = textEncodingNameMap->size(); 307 fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size); 308 309 MutexLocker lock(encodingRegistryMutex()); 310 311 TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin(); 312 TextEncodingNameMap::const_iterator end = textEncodingNameMap->end(); 313 for (; it != end; ++it) 314 fprintf(stderr, "'%s' => '%s'\n", it->key, it->value); 315 } 316 #endif 317 318 } // namespace WTF 319