Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
      3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "wtf/text/TextEncodingRegistry.h"
     29 
     30 #include "wtf/ASCIICType.h"
     31 #include "wtf/CurrentTime.h"
     32 #include "wtf/HashMap.h"
     33 #include "wtf/HashSet.h"
     34 #include "wtf/MainThread.h"
     35 #include "wtf/StdLibExtras.h"
     36 #include "wtf/StringExtras.h"
     37 #include "wtf/ThreadingPrimitives.h"
     38 #include "wtf/text/CString.h"
     39 #include "wtf/text/TextCodecICU.h"
     40 #include "wtf/text/TextCodecLatin1.h"
     41 #include "wtf/text/TextCodecUTF16.h"
     42 #include "wtf/text/TextCodecUTF8.h"
     43 #include "wtf/text/TextCodecUserDefined.h"
     44 #include "wtf/text/TextEncoding.h"
     45 
     46 namespace WTF {
     47 
// Longest encoding name (excluding the terminating NUL) handled by this file:
// addToTextEncodingNameMap() asserts that aliases fit within it, and the
// character-buffer overload of atomicCanonicalTextEncodingName() returns 0 for
// anything longer.
const size_t maxEncodingNameLength = 63;
     49 
// Hash functions for NUL-terminated, all-ASCII encoding names. Both equal()
// and hash() pass every character through toASCIILower(), so two names that
// differ only in ASCII case compare equal and produce the same hash.
struct TextEncodingNameHash {
    // Returns true when s1 and s2 have the same length and match character
    // by character after ASCII lower-casing.
    static bool equal(const char* s1, const char* s2)
    {
        char c1;
        char c2;
        do {
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER == 170051106
            // Workaround for a bug in the VS2012 Update 1 optimizer, remove once the fix is released.
            // https://connect.microsoft.com/VisualStudio/feedback/details/777533/vs2012-c-optimizing-bug-when-using-inline-and-char-return-type-x86-target-only
            c1 = toASCIILower(*s1++);
            c2 = toASCIILower(*s2++);
            if (c1 != c2)
                return false;
#else
            c1 = *s1++;
            c2 = *s2++;
            if (toASCIILower(c1) != toASCIILower(c2))
                return false;
#endif
        } while (c1 && c2);
        // The loop stops as soon as either string hits its terminator; the
        // names are equal only if both ended at the same position.
        return !c1 && !c2;
    }

    // This algorithm is the one-at-a-time hash from:
    // http://burtleburtle.net/bob/hash/hashfaq.html
    // http://burtleburtle.net/bob/hash/doobs.html
    // Each character is lower-cased before being mixed in, which keeps the
    // hash consistent with the case-insensitive equal() above.
    static unsigned hash(const char* s)
    {
        unsigned h = WTF::stringHashingStartValue;
        for (;;) {
            char c = *s++;
            if (!c) {
                // End of string: apply the final mixing steps and return.
                h += (h << 3);
                h ^= (h >> 11);
                h += (h << 15);
                return h;
            }
            h += toASCIILower(c);
            h += (h << 10);
            h ^= (h >> 6);
        }
    }

    // NOTE(review): presumably tells the hash table not to pass its
    // empty/deleted bucket markers to equal(), which dereferences both
    // arguments - confirm against the WTF hash traits documentation.
    static const bool safeToCompareToEmptyOrDeleted = false;
};
     96 
     97 struct TextCodecFactory {
     98     NewTextCodecFunction function;
     99     const void* additionalData;
    100     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
    101 };
    102 
// Alias -> atomic (canonical) name. Keys are compared with TextEncodingNameHash,
// i.e. ASCII case-insensitively.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Atomic name -> codec factory. Keys are the atomic name pointers obtained from
// TextEncodingNameMap (see addToTextCodecMap()).
// NOTE(review): uses the default hash for const char*, presumably pointer
// identity rather than string contents - confirm.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
    105 
    106 static Mutex& encodingRegistryMutex()
    107 {
    108     // We don't have to use AtomicallyInitializedStatic here because
    109     // this function is called on the main thread for any page before
    110     // it is used in worker threads.
    111     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    112     return mutex;
    113 }
    114 
// Registry state. The two maps are allocated by buildBaseTextCodecMaps(); the
// two quirk sets are allocated by buildQuirksSets().
// Alias -> atomic name.
static TextEncodingNameMap* textEncodingNameMap;
// Atomic name -> codec factory.
static TextCodecMap* textCodecMap;
// Set by atomicCanonicalTextEncodingName() once extendTextCodecMaps() has run.
static bool didExtendTextCodecMaps;
// Atomic names queried by isJapaneseEncoding().
static HashSet<const char*>* japaneseEncodings;
// Atomic names queried by shouldShowBackslashAsCurrencySymbolIn().
static HashSet<const char*>* nonBackslashEncodings;

// Encodings that pruneBlacklistedCodecs() removes from both maps, together
// with all of their aliases. Each entry holds five characters plus the NUL.
static const char textEncodingNameBlacklist[][6] = { "UTF-7" };
    122 
    123 #if ERROR_DISABLED
    124 
    125 static inline void checkExistingName(const char*, const char*) { }
    126 
    127 #else
    128 
    129 static void checkExistingName(const char* alias, const char* atomicName)
    130 {
    131     const char* oldAtomicName = textEncodingNameMap->get(alias);
    132     if (!oldAtomicName)
    133         return;
    134     if (oldAtomicName == atomicName)
    135         return;
    136     // Keep the warning silent about one case where we know this will happen.
    137     if (strcmp(alias, "ISO-8859-8-I") == 0
    138             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
    139             && strcasecmp(atomicName, "iso-8859-8") == 0)
    140         return;
    141     WTF_LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
    142 }
    143 
    144 #endif
    145 
    146 static bool isUndesiredAlias(const char* alias)
    147 {
    148     // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    149     for (const char* p = alias; *p; ++p) {
    150         if (*p == ',')
    151             return true;
    152     }
    153     // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    154     // problem, see bug 43554.
    155     if (0 == strcmp(alias, "8859_1"))
    156         return true;
    157     return false;
    158 }
    159 
    160 static void addToTextEncodingNameMap(const char* alias, const char* name)
    161 {
    162     ASSERT(strlen(alias) <= maxEncodingNameLength);
    163     if (isUndesiredAlias(alias))
    164         return;
    165     const char* atomicName = textEncodingNameMap->get(name);
    166     ASSERT(strcmp(alias, name) == 0 || atomicName);
    167     if (!atomicName)
    168         atomicName = name;
    169     checkExistingName(alias, atomicName);
    170     textEncodingNameMap->add(alias, atomicName);
    171 }
    172 
    173 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
    174 {
    175     const char* atomicName = textEncodingNameMap->get(name);
    176     ASSERT(atomicName);
    177     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
    178 }
    179 
    180 static void pruneBlacklistedCodecs()
    181 {
    182     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
    183         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
    184         if (!atomicName)
    185             continue;
    186 
    187         Vector<const char*> names;
    188         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    189         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    190         for (; it != end; ++it) {
    191             if (it->value == atomicName)
    192                 names.append(it->key);
    193         }
    194 
    195         size_t length = names.size();
    196         for (size_t j = 0; j < length; ++j)
    197             textEncodingNameMap->remove(names[j]);
    198 
    199         textCodecMap->remove(atomicName);
    200     }
    201 }
    202 
    203 static void buildBaseTextCodecMaps()
    204 {
    205     ASSERT(isMainThread());
    206     ASSERT(!textCodecMap);
    207     ASSERT(!textEncodingNameMap);
    208 
    209     textCodecMap = new TextCodecMap;
    210     textEncodingNameMap = new TextEncodingNameMap;
    211 
    212     TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    213     TextCodecLatin1::registerCodecs(addToTextCodecMap);
    214 
    215     TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    216     TextCodecUTF8::registerCodecs(addToTextCodecMap);
    217 
    218     TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    219     TextCodecUTF16::registerCodecs(addToTextCodecMap);
    220 
    221     TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    222     TextCodecUserDefined::registerCodecs(addToTextCodecMap);
    223 }
    224 
    225 static void addEncodingName(HashSet<const char*>* set, const char* name)
    226 {
    227     // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
    228     const char* atomicName = textEncodingNameMap->get(name);
    229     if (atomicName)
    230         set->add(atomicName);
    231 }
    232 
    233 static void buildQuirksSets()
    234 {
    235     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
    236     // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
    237 
    238     ASSERT(!japaneseEncodings);
    239     ASSERT(!nonBackslashEncodings);
    240 
    241     japaneseEncodings = new HashSet<const char*>;
    242     addEncodingName(japaneseEncodings, "EUC-JP");
    243     addEncodingName(japaneseEncodings, "ISO-2022-JP");
    244     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
    245     addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
    246     addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
    247     addEncodingName(japaneseEncodings, "JIS_C6226-1978");
    248     addEncodingName(japaneseEncodings, "JIS_X0201");
    249     addEncodingName(japaneseEncodings, "JIS_X0208-1983");
    250     addEncodingName(japaneseEncodings, "JIS_X0208-1990");
    251     addEncodingName(japaneseEncodings, "JIS_X0212-1990");
    252     addEncodingName(japaneseEncodings, "Shift_JIS");
    253     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
    254     addEncodingName(japaneseEncodings, "cp932");
    255     addEncodingName(japaneseEncodings, "x-mac-japanese");
    256 
    257     nonBackslashEncodings = new HashSet<const char*>;
    258     // The text encodings below treat backslash as a currency symbol for IE compatibility.
    259     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    260     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
    261     addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
    262     addEncodingName(nonBackslashEncodings, "EUC-JP");
    263     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
    264     addEncodingName(nonBackslashEncodings, "Shift_JIS");
    265     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
    266 }
    267 
    268 bool isJapaneseEncoding(const char* canonicalEncodingName)
    269 {
    270     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
    271 }
    272 
    273 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
    274 {
    275     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
    276 }
    277 
// Adds the ICU-provided encodings to the registry, then removes blacklisted
// encodings and builds the quirk sets. The order matters: names are registered
// before codecs (addToTextCodecMap() looks names up), and pruning and the
// quirk sets both read the fully populated name map.
static void extendTextCodecMaps()
{
    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);

    pruneBlacklistedCodecs();
    buildQuirksSets();
}
    286 
    287 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
    288 {
    289     MutexLocker lock(encodingRegistryMutex());
    290 
    291     ASSERT(textCodecMap);
    292     TextCodecFactory factory = textCodecMap->get(encoding.name());
    293     ASSERT(factory.function);
    294     return factory.function(encoding, factory.additionalData);
    295 }
    296 
    297 const char* atomicCanonicalTextEncodingName(const char* name)
    298 {
    299     if (!name || !name[0])
    300         return 0;
    301     if (!textEncodingNameMap)
    302         buildBaseTextCodecMaps();
    303 
    304     MutexLocker lock(encodingRegistryMutex());
    305 
    306     if (const char* atomicName = textEncodingNameMap->get(name))
    307         return atomicName;
    308     if (didExtendTextCodecMaps)
    309         return 0;
    310     extendTextCodecMaps();
    311     didExtendTextCodecMaps = true;
    312     return textEncodingNameMap->get(name);
    313 }
    314 
    315 template <typename CharacterType>
    316 const char* atomicCanonicalTextEncodingName(const CharacterType* characters, size_t length)
    317 {
    318     char buffer[maxEncodingNameLength + 1];
    319     size_t j = 0;
    320     for (size_t i = 0; i < length; ++i) {
    321         CharacterType c = characters[i];
    322         if (j == maxEncodingNameLength)
    323             return 0;
    324         buffer[j++] = c;
    325     }
    326     buffer[j] = 0;
    327     return atomicCanonicalTextEncodingName(buffer);
    328 }
    329 
    330 const char* atomicCanonicalTextEncodingName(const String& alias)
    331 {
    332     if (!alias.length())
    333         return 0;
    334 
    335     if (alias.is8Bit())
    336         return atomicCanonicalTextEncodingName<LChar>(alias.characters8(), alias.length());
    337 
    338     return atomicCanonicalTextEncodingName<UChar>(alias.characters16(), alias.length());
    339 }
    340 
    341 bool noExtendedTextEncodingNameUsed()
    342 {
    343     // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    344     return !didExtendTextCodecMaps;
    345 }
    346 
    347 #ifndef NDEBUG
    348 void dumpTextEncodingNameMap()
    349 {
    350     unsigned size = textEncodingNameMap->size();
    351     fprintf(stderr, "Dumping %u entries in WTF::TextEncodingNameMap...\n", size);
    352 
    353     MutexLocker lock(encodingRegistryMutex());
    354 
    355     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    356     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    357     for (; it != end; ++it)
    358         fprintf(stderr, "'%s' => '%s'\n", it->key, it->value);
    359 }
    360 #endif
    361 
    362 } // namespace WTF
    363