Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
      3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "wtf/text/TextEncodingRegistry.h"
     29 
     30 #include "wtf/ASCIICType.h"
     31 #include "wtf/CurrentTime.h"
     32 #include "wtf/HashMap.h"
     33 #include "wtf/HashSet.h"
     34 #include "wtf/MainThread.h"
     35 #include "wtf/StdLibExtras.h"
     36 #include "wtf/StringExtras.h"
     37 #include "wtf/ThreadingPrimitives.h"
     38 #include "wtf/text/CString.h"
     39 #include "wtf/text/TextCodecICU.h"
     40 #include "wtf/text/TextCodecLatin1.h"
     41 #include "wtf/text/TextCodecReplacement.h"
     42 #include "wtf/text/TextCodecUTF16.h"
     43 #include "wtf/text/TextCodecUTF8.h"
     44 #include "wtf/text/TextCodecUserDefined.h"
     45 #include "wtf/text/TextEncoding.h"
     46 
     47 namespace WTF {
     48 
// Longest encoding name, not counting the terminating NUL, that this registry
// handles. It sizes the on-stack buffer used when canonicalizing character
// arrays and is asserted against every alias that gets registered.
const size_t maxEncodingNameLength = 63;
     50 
     51 // Hash for all-ASCII strings that does case folding.
     52 struct TextEncodingNameHash {
     53     static bool equal(const char* s1, const char* s2)
     54     {
     55         char c1;
     56         char c2;
     57         do {
     58 #if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
     59             // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
     60             // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
     61             c1 = toASCIILower(*s1++);
     62             c2 = toASCIILower(*s2++);
     63             if (c1 != c2)
     64                 return false;
     65 #else
     66             c1 = *s1++;
     67             c2 = *s2++;
     68             if (toASCIILower(c1) != toASCIILower(c2))
     69                 return false;
     70 #endif
     71         } while (c1 && c2);
     72         return !c1 && !c2;
     73     }
     74 
     75     // This algorithm is the one-at-a-time hash from:
     76     // http://burtleburtle.net/bob/hash/hashfaq.html
     77     // http://burtleburtle.net/bob/hash/doobs.html
     78     static unsigned hash(const char* s)
     79     {
     80         unsigned h = WTF::stringHashingStartValue;
     81         for (;;) {
     82             char c = *s++;
     83             if (!c) {
     84                 h += (h << 3);
     85                 h ^= (h >> 11);
     86                 h += (h << 15);
     87                 return h;
     88             }
     89             h += toASCIILower(c);
     90             h += (h << 10);
     91             h ^= (h >> 6);
     92         }
     93     }
     94 
     95     static const bool safeToCompareToEmptyOrDeleted = false;
     96 };
     97 
     98 struct TextCodecFactory {
     99     NewTextCodecFunction function;
    100     const void* additionalData;
    101     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
    102 };
    103 
// Alias -> atomic (canonical) name. Keys are hashed and compared with
// TextEncodingNameHash, i.e. ASCII case-insensitively.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Atomic name -> factory used to create a codec for that encoding. No custom
// hash is supplied; NOTE(review): this presumably relies on atomic names being
// unique pointers - confirm against HashMap's default traits for const char*.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
    106 
    107 static Mutex& encodingRegistryMutex()
    108 {
    109     // We don't have to use AtomicallyInitializedStatic here because
    110     // this function is called on the main thread for any page before
    111     // it is used in worker threads.
    112     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    113     return mutex;
    114 }
    115 
// Registry state. Both maps are allocated by buildBaseTextCodecMaps() and
// are null until then.
static TextEncodingNameMap* textEncodingNameMap;
static TextCodecMap* textCodecMap;
// Becomes true once extendTextCodecMaps() has run (set in
// atomicCanonicalTextEncodingName()).
static bool didExtendTextCodecMaps;

// Encoding names whose aliases and codecs are stripped from the registry by
// pruneBlacklistedCodecs(). Each entry holds up to 5 characters plus the NUL.
static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
    121 
    122 #if ERROR_DISABLED
    123 
    124 static inline void checkExistingName(const char*, const char*) { }
    125 
    126 #else
    127 
    128 static void checkExistingName(const char* alias, const char* atomicName)
    129 {
    130     const char* oldAtomicName = textEncodingNameMap->get(alias);
    131     if (!oldAtomicName)
    132         return;
    133     if (oldAtomicName == atomicName)
    134         return;
    135     // Keep the warning silent about one case where we know this will happen.
    136     if (strcmp(alias, "ISO-8859-8-I") == 0
    137             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
    138             && strcasecmp(atomicName, "iso-8859-8") == 0)
    139         return;
    140     WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
    141 }
    142 
    143 #endif
    144 
    145 static bool isUndesiredAlias(const char* alias)
    146 {
    147     // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    148     for (const char* p = alias; *p; ++p) {
    149         if (*p == ',')
    150             return true;
    151     }
    152     // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    153     // problem, see bug 43554.
    154     if (0 == strcmp(alias, "8859_1"))
    155         return true;
    156     return false;
    157 }
    158 
    159 static void addToTextEncodingNameMap(const char* alias, const char* name)
    160 {
    161     ASSERT(strlen(alias) <= maxEncodingNameLength);
    162     if (isUndesiredAlias(alias))
    163         return;
    164     const char* atomicName = textEncodingNameMap->get(name);
    165     ASSERT(strcmp(alias, name) == 0 || atomicName);
    166     if (!atomicName)
    167         atomicName = name;
    168     checkExistingName(alias, atomicName);
    169     textEncodingNameMap->add(alias, atomicName);
    170 }
    171 
    172 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
    173 {
    174     const char* atomicName = textEncodingNameMap->get(name);
    175     ASSERT(atomicName);
    176     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
    177 }
    178 
    179 static void pruneBlacklistedCodecs()
    180 {
    181     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
    182         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
    183         if (!atomicName)
    184             continue;
    185 
    186         Vector<const char*> names;
    187         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    188         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    189         for (; it != end; ++it) {
    190             if (it->value == atomicName)
    191                 names.append(it->key);
    192         }
    193 
    194         textEncodingNameMap->removeAll(names);
    195 
    196         textCodecMap->remove(atomicName);
    197     }
    198 }
    199 
// Allocates both registry maps and registers the built-in codecs (Latin-1,
// UTF-8, UTF-16 and user-defined). Must run on the main thread and only once:
// it asserts that neither map exists yet.
static void buildBaseTextCodecMaps()
{
    ASSERT(isMainThread());
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec the names are registered before the factories, because
    // addToTextCodecMap() looks the atomic name up in the name map.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);
}
    221 
    222 bool isReplacementEncoding(const char* alias)
    223 {
    224     return alias && !strcasecmp(alias, "replacement");
    225 }
    226 
// Returns true when |alias| compares equal to the literal "replacement".
// NOTE(review): unlike the const char* overload, which uses strcasecmp, this
// goes through String's operator== - presumably a case-sensitive match, so
// callers are expected to pass an already-canonical name; confirm.
bool isReplacementEncoding(const String& alias)
{
    return alias == "replacement";
}
    231 
// Adds the replacement and ICU-backed encodings to the registry, then strips
// the blacklisted ones. Called from atomicCanonicalTextEncodingName() the
// first time a name is not found among the already-registered encodings.
static void extendTextCodecMaps()
{
    TextCodecReplacement::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecReplacement::registerCodecs(addToTextCodecMap);

    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    // Must come last so that blacklisted names registered above are removed.
    pruneBlacklistedCodecs();
}
    242 
    243 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
    244 {
    245     MutexLocker lock(encodingRegistryMutex());
    246 
    247     ASSERT(textCodecMap);
    248     TextCodecFactory factory = textCodecMap->get(encoding.name());
    249     ASSERT(factory.function);
    250     return factory.function(encoding, factory.additionalData);
    251 }
    252 
    253 const char* atomicCanonicalTextEncodingName(const char* name)
    254 {
    255     if (!name || !name[0])
    256         return 0;
    257     if (!textEncodingNameMap)
    258         buildBaseTextCodecMaps();
    259 
    260     MutexLocker lock(encodingRegistryMutex());
    261 
    262     if (const char* atomicName = textEncodingNameMap->get(name))
    263         return atomicName;
    264     if (didExtendTextCodecMaps)
    265         return 0;
    266     extendTextCodecMaps();
    267     didExtendTextCodecMaps = true;
    268     return textEncodingNameMap->get(name);
    269 }
    270 
    271 template <typename CharacterType>
    272 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
    273 {
    274     char buffer[maxEncodingNameLength + 1];
    275     size_t j = 0;
    276     for (size_t i = 0; i < length; ++i) {
    277         CharacterType c = characters[i];
    278         if (j == maxEncodingNameLength)
    279             return 0;
    280         buffer[j++] = c;
    281     }
    282     buffer[j] = 0;
    283     return atomicCanonicalTextEncodingName(buffer);
    284 }
    285 
    286 const char* atomicCanonicalTextEncodingName(const String& alias)
    287 {
    288     if (!alias.length())
    289         return 0;
    290 
    291     if (alias.is8Bit())
    292         return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
    293 
    294     return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
    295 }
    296 
// Returns true while the extended encodings have not been registered yet,
// i.e. didExtendTextCodecMaps is still false. The flag is read without taking
// the registry mutex.
bool noExtendedTextEncodingNameUsed()
{
    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    return !didExtendTextCodecMaps;
}
    302 
    303 #ifndef NDEBUG
    304 void dumpTextEncodingNameMap()
    305 {
    306     unsigned size = textEncodingNameMap->size();
    307     fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
    308 
    309     MutexLocker lock(encodingRegistryMutex());
    310 
    311     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    312     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    313     for (; it != end; ++it)
    314         fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
    315 }
    316 #endif
    317 
    318 } // namespace WTF
    319