Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006, 2007, 2011 Apple Inc. All rights reserved.
      3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextEncodingRegistry.h"
     29 
     30 #include "TextCodecLatin1.h"
     31 #include "TextCodecUserDefined.h"
     32 #include "TextCodecUTF16.h"
     33 #include "TextCodecUTF8.h"
     34 #include "TextEncoding.h"
     35 #include <wtf/ASCIICType.h>
     36 #include <wtf/HashMap.h>
     37 #include <wtf/HashSet.h>
     38 #include <wtf/StdLibExtras.h>
     39 #include <wtf/StringExtras.h>
     40 #include <wtf/Threading.h>
     41 
     42 #if USE(ICU_UNICODE)
     43 #include "TextCodecICU.h"
     44 #endif
     45 #if PLATFORM(MAC)
     46 #include "TextCodecMac.h"
     47 #endif
     48 #if PLATFORM(QT)
     49 #include "qt/TextCodecQt.h"
     50 #endif
     51 #if USE(GLIB_UNICODE)
     52 #include "gtk/TextCodecGtk.h"
     53 #endif
     54 #if USE(BREWMP_UNICODE)
     55 #include "brew/TextCodecBrew.h"
     56 #endif
     57 #if OS(WINCE) && !PLATFORM(QT)
     58 #include "TextCodecWinCE.h"
     59 #endif
     60 
     61 #include <wtf/CurrentTime.h>
     62 #include <wtf/text/CString.h>
     63 
     64 using namespace WTF;
     65 
     66 namespace WebCore {
     67 
// Longest encoding name, in characters and not counting the terminating NUL,
// that the registry handles: addToTextEncodingNameMap() asserts that aliases
// fit, and the UChar overload of atomicCanonicalTextEncodingName() sizes its
// stack buffer from it and returns 0 for longer input.
const size_t maxEncodingNameLength = 63;
     69 
     70 // Hash for all-ASCII strings that does case folding.
     71 struct TextEncodingNameHash {
     72     static bool equal(const char* s1, const char* s2)
     73     {
     74         char c1;
     75         char c2;
     76         do {
     77             c1 = *s1++;
     78             c2 = *s2++;
     79             if (toASCIILower(c1) != toASCIILower(c2))
     80                 return false;
     81         } while (c1 && c2);
     82         return !c1 && !c2;
     83     }
     84 
     85     // This algorithm is the one-at-a-time hash from:
     86     // http://burtleburtle.net/bob/hash/hashfaq.html
     87     // http://burtleburtle.net/bob/hash/doobs.html
     88     static unsigned hash(const char* s)
     89     {
     90         unsigned h = WTF::stringHashingStartValue;
     91         for (;;) {
     92             char c = *s++;
     93             if (!c) {
     94                 h += (h << 3);
     95                 h ^= (h >> 11);
     96                 h += (h << 15);
     97                 return h;
     98             }
     99             h += toASCIILower(c);
    100             h += (h << 10);
    101             h ^= (h >> 6);
    102         }
    103     }
    104 
    105     static const bool safeToCompareToEmptyOrDeleted = false;
    106 };
    107 
    108 struct TextCodecFactory {
    109     NewTextCodecFunction function;
    110     const void* additionalData;
    111     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
    112 };
    113 
// Alias -> canonical ("atomic") name. Keys are hashed and compared with
// TextEncodingNameHash, i.e. with ASCII case folding.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Canonical name -> factory for its codec. addToTextCodecMap() always inserts
// under the name pointer obtained from the TextEncodingNameMap.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
    116 
    117 static Mutex& encodingRegistryMutex()
    118 {
    119     // We don't have to use AtomicallyInitializedStatic here because
    120     // this function is called on the main thread for any page before
    121     // it is used in worker threads.
    122     DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    123     return mutex;
    124 }
    125 
// Alias -> canonical name table; allocated by buildBaseTextCodecMaps().
static TextEncodingNameMap* textEncodingNameMap;
// Canonical name -> codec factory table; allocated by buildBaseTextCodecMaps().
static TextCodecMap* textCodecMap;
// Set by atomicCanonicalTextEncodingName() once extendTextCodecMaps() has run.
static bool didExtendTextCodecMaps;
// Canonical names consulted by isJapaneseEncoding(); filled by buildQuirksSets().
static HashSet<const char*>* japaneseEncodings;
// Canonical names consulted by shouldShowBackslashAsCurrencySymbolIn(); filled by buildQuirksSets().
static HashSet<const char*>* nonBackslashEncodings;

// Encodings that pruneBlacklistedCodecs() removes again, aliases and codec included.
static const char* const textEncodingNameBlacklist[] = { "UTF-7" };
    133 
    134 #if ERROR_DISABLED
    135 
    136 static inline void checkExistingName(const char*, const char*) { }
    137 
    138 #else
    139 
    140 static void checkExistingName(const char* alias, const char* atomicName)
    141 {
    142     const char* oldAtomicName = textEncodingNameMap->get(alias);
    143     if (!oldAtomicName)
    144         return;
    145     if (oldAtomicName == atomicName)
    146         return;
    147     // Keep the warning silent about one case where we know this will happen.
    148     if (strcmp(alias, "ISO-8859-8-I") == 0
    149             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
    150             && strcasecmp(atomicName, "iso-8859-8") == 0)
    151         return;
    152     LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s", alias, oldAtomicName, atomicName);
    153 }
    154 
    155 #endif
    156 
    157 static bool isUndesiredAlias(const char* alias)
    158 {
    159     // Reject aliases with version numbers that are supported by some back-ends (such as "ISO_2022,locale=ja,version=0" in ICU).
    160     for (const char* p = alias; *p; ++p) {
    161         if (*p == ',')
    162             return true;
    163     }
    164     // 8859_1 is known to (at least) ICU, but other browsers don't support this name - and having it caused a compatibility
    165     // problem, see bug 43554.
    166     if (0 == strcmp(alias, "8859_1"))
    167         return true;
    168     return false;
    169 }
    170 
    171 static void addToTextEncodingNameMap(const char* alias, const char* name)
    172 {
    173     ASSERT(strlen(alias) <= maxEncodingNameLength);
    174     if (isUndesiredAlias(alias))
    175         return;
    176     const char* atomicName = textEncodingNameMap->get(name);
    177     ASSERT(strcmp(alias, name) == 0 || atomicName);
    178     if (!atomicName)
    179         atomicName = name;
    180     checkExistingName(alias, atomicName);
    181     textEncodingNameMap->add(alias, atomicName);
    182 }
    183 
    184 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
    185 {
    186     const char* atomicName = textEncodingNameMap->get(name);
    187     ASSERT(atomicName);
    188     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
    189 }
    190 
    191 static void pruneBlacklistedCodecs()
    192 {
    193     for (size_t i = 0; i < WTF_ARRAY_LENGTH(textEncodingNameBlacklist); ++i) {
    194         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
    195         if (!atomicName)
    196             continue;
    197 
    198         Vector<const char*> names;
    199         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    200         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    201         for (; it != end; ++it) {
    202             if (it->second == atomicName)
    203                 names.append(it->first);
    204         }
    205 
    206         size_t length = names.size();
    207         for (size_t j = 0; j < length; ++j)
    208             textEncodingNameMap->remove(names[j]);
    209 
    210         textCodecMap->remove(atomicName);
    211     }
    212 }
    213 
// Allocates the two registry maps and registers the codecs that are always
// available: Latin-1, UTF-8, UTF-16 and user-defined. Must run on the main
// thread and only once (both conditions are asserted). For each codec the
// names are registered before the codec, because addToTextCodecMap() looks
// the name up in the name map.
static void buildBaseTextCodecMaps()
{
    ASSERT(isMainThread());
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF8::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF8::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);

#if USE(GLIB_UNICODE)
    // FIXME: This is not needed. The code above covers all the base codecs.
    TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
    TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
#endif
}
    241 
    242 static void addEncodingName(HashSet<const char*>* set, const char* name)
    243 {
    244     // We must not use atomicCanonicalTextEncodingName() because this function is called in it.
    245     const char* atomicName = textEncodingNameMap->get(name);
    246     if (atomicName)
    247         set->add(atomicName);
    248 }
    249 
    250 static void buildQuirksSets()
    251 {
    252     // FIXME: Having isJapaneseEncoding() and shouldShowBackslashAsCurrencySymbolIn()
    253     // and initializing the sets for them in TextEncodingRegistry.cpp look strange.
    254 
    255     ASSERT(!japaneseEncodings);
    256     ASSERT(!nonBackslashEncodings);
    257 
    258     japaneseEncodings = new HashSet<const char*>;
    259     addEncodingName(japaneseEncodings, "EUC-JP");
    260     addEncodingName(japaneseEncodings, "ISO-2022-JP");
    261     addEncodingName(japaneseEncodings, "ISO-2022-JP-1");
    262     addEncodingName(japaneseEncodings, "ISO-2022-JP-2");
    263     addEncodingName(japaneseEncodings, "ISO-2022-JP-3");
    264     addEncodingName(japaneseEncodings, "JIS_C6226-1978");
    265     addEncodingName(japaneseEncodings, "JIS_X0201");
    266     addEncodingName(japaneseEncodings, "JIS_X0208-1983");
    267     addEncodingName(japaneseEncodings, "JIS_X0208-1990");
    268     addEncodingName(japaneseEncodings, "JIS_X0212-1990");
    269     addEncodingName(japaneseEncodings, "Shift_JIS");
    270     addEncodingName(japaneseEncodings, "Shift_JIS_X0213-2000");
    271     addEncodingName(japaneseEncodings, "cp932");
    272     addEncodingName(japaneseEncodings, "x-mac-japanese");
    273 
    274     nonBackslashEncodings = new HashSet<const char*>;
    275     // The text encodings below treat backslash as a currency symbol for IE compatibility.
    276     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    277     addEncodingName(nonBackslashEncodings, "x-mac-japanese");
    278     addEncodingName(nonBackslashEncodings, "ISO-2022-JP");
    279     addEncodingName(nonBackslashEncodings, "EUC-JP");
    280     // Shift_JIS_X0213-2000 is not the same encoding as Shift_JIS on Mac. We need to register both of them.
    281     addEncodingName(nonBackslashEncodings, "Shift_JIS");
    282     addEncodingName(nonBackslashEncodings, "Shift_JIS_X0213-2000");
    283 }
    284 
    285 bool isJapaneseEncoding(const char* canonicalEncodingName)
    286 {
    287     return canonicalEncodingName && japaneseEncodings && japaneseEncodings->contains(canonicalEncodingName);
    288 }
    289 
    290 bool shouldShowBackslashAsCurrencySymbolIn(const char* canonicalEncodingName)
    291 {
    292     return canonicalEncodingName && nonBackslashEncodings && nonBackslashEncodings->contains(canonicalEncodingName);
    293 }
    294 
// Registers the encodings provided by whichever codec back-ends were selected
// at compile time, then removes blacklisted encodings and builds the quirk
// sets. Called from atomicCanonicalTextEncodingName() with the registry mutex
// held, the first time a name is not found among the base encodings.
static void extendTextCodecMaps()
{
#if USE(ICU_UNICODE)
    TextCodecICU::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerCodecs(addToTextCodecMap);
#endif

    // NOTE(review): this registration is guarded by USE(QT4_UNICODE) while the
    // TextCodecQt.h include is guarded by PLATFORM(QT) - confirm that the two
    // conditions always agree.
#if USE(QT4_UNICODE)
    TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecQt::registerCodecs(addToTextCodecMap);
#endif

#if PLATFORM(MAC)
    TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecMac::registerCodecs(addToTextCodecMap);
#endif

#if USE(GLIB_UNICODE)
    TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
    TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
#endif

#if OS(WINCE) && !PLATFORM(QT)
    TextCodecWinCE::registerExtendedEncodingNames(addToTextEncodingNameMap);
    TextCodecWinCE::registerExtendedCodecs(addToTextCodecMap);
#endif

    // Pruning runs after every back-end has registered, and before the quirk
    // sets are built from the name map.
    pruneBlacklistedCodecs();
    buildQuirksSets();
}
    325 
    326 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
    327 {
    328     MutexLocker lock(encodingRegistryMutex());
    329 
    330     ASSERT(textCodecMap);
    331     TextCodecFactory factory = textCodecMap->get(encoding.name());
    332     ASSERT(factory.function);
    333     return factory.function(encoding, factory.additionalData);
    334 }
    335 
    336 const char* atomicCanonicalTextEncodingName(const char* name)
    337 {
    338     if (!name || !name[0])
    339         return 0;
    340     if (!textEncodingNameMap)
    341         buildBaseTextCodecMaps();
    342 
    343     MutexLocker lock(encodingRegistryMutex());
    344 
    345     if (const char* atomicName = textEncodingNameMap->get(name))
    346         return atomicName;
    347     if (didExtendTextCodecMaps)
    348         return 0;
    349     extendTextCodecMaps();
    350     didExtendTextCodecMaps = true;
    351     return textEncodingNameMap->get(name);
    352 }
    353 
    354 const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length)
    355 {
    356     char buffer[maxEncodingNameLength + 1];
    357     size_t j = 0;
    358     for (size_t i = 0; i < length; ++i) {
    359         UChar c = characters[i];
    360         if (j == maxEncodingNameLength)
    361             return 0;
    362         buffer[j++] = c;
    363     }
    364     buffer[j] = 0;
    365     return atomicCanonicalTextEncodingName(buffer);
    366 }
    367 
    368 bool noExtendedTextEncodingNameUsed()
    369 {
    370     // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    371     return !didExtendTextCodecMaps;
    372 }
    373 
    374 #ifndef NDEBUG
    375 void dumpTextEncodingNameMap()
    376 {
    377     unsigned size = textEncodingNameMap->size();
    378     fprintf(stderr, "Dumping %u entries in WebCore::textEncodingNameMap...\n", size);
    379 
    380     MutexLocker lock(encodingRegistryMutex());
    381 
    382     TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    383     TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    384     for (; it != end; ++it)
    385         fprintf(stderr, "'%s' => '%s'\n", it->first, it->second);
    386 }
    387 #endif
    388 
    389 } // namespace WebCore
    390