Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006, 2007 Apple Inc. All rights reserved.
      3  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextEncodingRegistry.h"
     29 
     30 #include "PlatformString.h"
     31 #include "TextCodecLatin1.h"
     32 #include "TextCodecUserDefined.h"
     33 #include "TextCodecUTF16.h"
     34 #include <wtf/ASCIICType.h>
     35 #include <wtf/Assertions.h>
     36 #include <wtf/HashFunctions.h>
     37 #include <wtf/HashMap.h>
     38 #include <wtf/StdLibExtras.h>
     39 #include <wtf/StringExtras.h>
     40 #include <wtf/Threading.h>
     41 
     42 #if USE(ICU_UNICODE)
     43 #include "TextCodecICU.h"
     44 #endif
     45 #if PLATFORM(MAC)
     46 #include "TextCodecMac.h"
     47 #endif
     48 #if PLATFORM(QT)
     49 #include "qt/TextCodecQt.h"
     50 #endif
     51 #if USE(GLIB_UNICODE)
     52 #include "gtk/TextCodecGtk.h"
     53 #endif
     54 #if OS(WINCE) && !PLATFORM(QT)
     55 #include "TextCodecWince.h"
     56 #endif
     57 
     58 using namespace WTF;
     59 
     60 namespace WebCore {
     61 
// Longest encoding name, in characters and excluding the terminating NUL,
// that the registry accepts. Aliases are asserted to fit when they are
// registered, and the UChar lookup path sizes its stack buffer from it.
const size_t maxEncodingNameLength = 63;
     63 
     64 // Hash for all-ASCII strings that does case folding and skips any characters
     65 // that are not alphanumeric. If passed any non-ASCII characters, depends on
     66 // the behavior of isalnum -- if that returns false as it does on OS X, then
     67 // it will properly skip those characters too.
     68 struct TextEncodingNameHash {
     69 
     70     static bool equal(const char* s1, const char* s2)
     71     {
     72         char c1;
     73         char c2;
     74         do {
     75             do
     76                 c1 = *s1++;
     77             while (c1 && !isASCIIAlphanumeric(c1));
     78             do
     79                 c2 = *s2++;
     80             while (c2 && !isASCIIAlphanumeric(c2));
     81             if (toASCIILower(c1) != toASCIILower(c2))
     82                 return false;
     83         } while (c1 && c2);
     84         return !c1 && !c2;
     85     }
     86 
     87     // This algorithm is the one-at-a-time hash from:
     88     // http://burtleburtle.net/bob/hash/hashfaq.html
     89     // http://burtleburtle.net/bob/hash/doobs.html
     90     static unsigned hash(const char* s)
     91     {
     92         unsigned h = WTF::stringHashingStartValue;
     93         for (;;) {
     94             char c;
     95             do {
     96                 c = *s++;
     97                 if (!c) {
     98                     h += (h << 3);
     99                     h ^= (h >> 11);
    100                     h += (h << 15);
    101                     return h;
    102                 }
    103             } while (!isASCIIAlphanumeric(c));
    104             h += toASCIILower(c);
    105             h += (h << 10);
    106             h ^= (h >> 6);
    107         }
    108     }
    109 
    110     static const bool safeToCompareToEmptyOrDeleted = false;
    111 };
    112 
    113 struct TextCodecFactory {
    114     NewTextCodecFunction function;
    115     const void* additionalData;
    116     TextCodecFactory(NewTextCodecFunction f = 0, const void* d = 0) : function(f), additionalData(d) { }
    117 };
    118 
// Maps each registered encoding name or alias to its canonical ("atomic")
// name; keys are hashed and compared with TextEncodingNameHash.
typedef HashMap<const char*, const char*, TextEncodingNameHash> TextEncodingNameMap;
// Maps an atomic encoding name to the factory used to create its codec.
typedef HashMap<const char*, TextCodecFactory> TextCodecMap;
    121 
// Returns the mutex that newTextCodec() and atomicCanonicalTextEncodingName()
// hold while they access the registry maps.
static Mutex& encodingRegistryMutex()
{
    // We don't have to use AtomicallyInitializedStatic here because
    // this function is called on the main thread for any page before
    // it is used in worker threads.
    DEFINE_STATIC_LOCAL(Mutex, mutex, ());
    return mutex;
}
    130 
// Registry state. Both maps are allocated by buildBaseTextCodecMaps() and
// remain null until then.
static TextEncodingNameMap* textEncodingNameMap;
static TextCodecMap* textCodecMap;
// Set once atomicCanonicalTextEncodingName() has run extendTextCodecMaps().
static bool didExtendTextCodecMaps;

// Encodings whose names, aliases and codec are removed again by
// pruneBlacklistedCodecs() after the extended tables have been registered.
static const char* const textEncodingNameBlacklist[] = {
    "UTF-7"
};
    138 
    139 #if ERROR_DISABLED
    140 
    141 static inline void checkExistingName(const char*, const char*) { }
    142 
    143 #else
    144 
    145 static void checkExistingName(const char* alias, const char* atomicName)
    146 {
    147     const char* oldAtomicName = textEncodingNameMap->get(alias);
    148     if (!oldAtomicName)
    149         return;
    150     if (oldAtomicName == atomicName)
    151         return;
    152     // Keep the warning silent about one case where we know this will happen.
    153     if (strcmp(alias, "ISO-8859-8-I") == 0
    154             && strcmp(oldAtomicName, "ISO-8859-8-I") == 0
    155             && strcasecmp(atomicName, "iso-8859-8") == 0)
    156         return;
    157     LOG_ERROR("alias %s maps to %s already, but someone is trying to make it map to %s",
    158         alias, oldAtomicName, atomicName);
    159 }
    160 
    161 #endif
    162 
    163 static void addToTextEncodingNameMap(const char* alias, const char* name)
    164 {
    165     ASSERT(strlen(alias) <= maxEncodingNameLength);
    166     const char* atomicName = textEncodingNameMap->get(name);
    167     ASSERT(strcmp(alias, name) == 0 || atomicName);
    168     if (!atomicName)
    169         atomicName = name;
    170     checkExistingName(alias, atomicName);
    171     textEncodingNameMap->add(alias, atomicName);
    172 }
    173 
    174 static void addToTextCodecMap(const char* name, NewTextCodecFunction function, const void* additionalData)
    175 {
    176     const char* atomicName = textEncodingNameMap->get(name);
    177     ASSERT(atomicName);
    178     textCodecMap->add(atomicName, TextCodecFactory(function, additionalData));
    179 }
    180 
    181 static void pruneBlacklistedCodecs()
    182 {
    183     size_t blacklistedCodecListLength = sizeof(textEncodingNameBlacklist) / sizeof(textEncodingNameBlacklist[0]);
    184     for (size_t i = 0; i < blacklistedCodecListLength; ++i) {
    185         const char* atomicName = textEncodingNameMap->get(textEncodingNameBlacklist[i]);
    186         if (!atomicName)
    187             continue;
    188 
    189         Vector<const char*> names;
    190         TextEncodingNameMap::const_iterator it = textEncodingNameMap->begin();
    191         TextEncodingNameMap::const_iterator end = textEncodingNameMap->end();
    192         for (; it != end; ++it) {
    193             if (it->second == atomicName)
    194                 names.append(it->first);
    195         }
    196 
    197         size_t length = names.size();
    198         for (size_t j = 0; j < length; ++j)
    199             textEncodingNameMap->remove(names[j]);
    200 
    201         textCodecMap->remove(atomicName);
    202     }
    203 }
    204 
// Allocates both registry maps and fills them with the base set of encodings:
// Latin-1, UTF-16 and user-defined always, plus the base tables of whichever
// platform codec backends are compiled in. Asserted to run on the main thread
// and only while both maps are still null.
// NOTE(review): the registration calls are kept in this exact order; HashMap::add
// presumably keeps an existing entry, which would make the first registration
// of a name win -- confirm before reordering.
static void buildBaseTextCodecMaps()
{
    ASSERT(isMainThread());
    ASSERT(!textCodecMap);
    ASSERT(!textEncodingNameMap);

    textCodecMap = new TextCodecMap;
    textEncodingNameMap = new TextEncodingNameMap;

    // For each codec, names are registered before factories because
    // addToTextCodecMap() looks the name up in the name map.
    TextCodecLatin1::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecLatin1::registerCodecs(addToTextCodecMap);

    TextCodecUTF16::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUTF16::registerCodecs(addToTextCodecMap);

    TextCodecUserDefined::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecUserDefined::registerCodecs(addToTextCodecMap);

#if USE(ICU_UNICODE)
    TextCodecICU::registerBaseEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerBaseCodecs(addToTextCodecMap);
#endif

#if USE(GLIB_UNICODE)
    TextCodecGtk::registerBaseEncodingNames(addToTextEncodingNameMap);
    TextCodecGtk::registerBaseCodecs(addToTextCodecMap);
#endif

#if OS(WINCE) && !PLATFORM(QT)
    TextCodecWince::registerBaseEncodingNames(addToTextEncodingNameMap);
    TextCodecWince::registerBaseCodecs(addToTextCodecMap);
#endif
}
    238 
// Adds the extended encoding tables of every compiled-in platform codec
// backend to the registry maps, then removes the blacklisted encodings.
// Called from atomicCanonicalTextEncodingName() while the registry mutex is
// held, the first time a name is not found in the maps.
static void extendTextCodecMaps()
{
#if USE(ICU_UNICODE)
    TextCodecICU::registerExtendedEncodingNames(addToTextEncodingNameMap);
    TextCodecICU::registerExtendedCodecs(addToTextCodecMap);
#endif

#if USE(QT4_UNICODE)
    TextCodecQt::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecQt::registerCodecs(addToTextCodecMap);
#endif

#if PLATFORM(MAC)
    TextCodecMac::registerEncodingNames(addToTextEncodingNameMap);
    TextCodecMac::registerCodecs(addToTextCodecMap);
#endif

#if USE(GLIB_UNICODE)
    TextCodecGtk::registerExtendedEncodingNames(addToTextEncodingNameMap);
    TextCodecGtk::registerExtendedCodecs(addToTextCodecMap);
#endif

#if OS(WINCE) && !PLATFORM(QT)
    TextCodecWince::registerExtendedEncodingNames(addToTextEncodingNameMap);
    TextCodecWince::registerExtendedCodecs(addToTextCodecMap);
#endif

    // Pruning runs last so it also removes names the backends above just added.
    pruneBlacklistedCodecs();
}
    268 
    269 PassOwnPtr<TextCodec> newTextCodec(const TextEncoding& encoding)
    270 {
    271     MutexLocker lock(encodingRegistryMutex());
    272 
    273     ASSERT(textCodecMap);
    274     TextCodecFactory factory = textCodecMap->get(encoding.name());
    275     ASSERT(factory.function);
    276     return factory.function(encoding, factory.additionalData);
    277 }
    278 
    279 const char* atomicCanonicalTextEncodingName(const char* name)
    280 {
    281     if (!name || !name[0])
    282         return 0;
    283     if (!textEncodingNameMap)
    284         buildBaseTextCodecMaps();
    285 
    286     MutexLocker lock(encodingRegistryMutex());
    287 
    288     if (const char* atomicName = textEncodingNameMap->get(name))
    289         return atomicName;
    290     if (didExtendTextCodecMaps)
    291         return 0;
    292     extendTextCodecMaps();
    293     didExtendTextCodecMaps = true;
    294     return textEncodingNameMap->get(name);
    295 }
    296 
    297 const char* atomicCanonicalTextEncodingName(const UChar* characters, size_t length)
    298 {
    299     char buffer[maxEncodingNameLength + 1];
    300     size_t j = 0;
    301     for (size_t i = 0; i < length; ++i) {
    302         UChar c = characters[i];
    303         if (isASCIIAlphanumeric(c)) {
    304             if (j == maxEncodingNameLength)
    305                 return 0;
    306             buffer[j++] = c;
    307         }
    308     }
    309     buffer[j] = 0;
    310     return atomicCanonicalTextEncodingName(buffer);
    311 }
    312 
// Returns true while the extended encoding tables have not been registered,
// i.e. while didExtendTextCodecMaps is still false. The flag is read without
// taking the registry mutex.
bool noExtendedTextEncodingNameUsed()
{
    // If the calling thread did not use extended encoding names, it is fine for it to use a stale false value.
    return !didExtendTextCodecMaps;
}
    318 
    319 } // namespace WebCore
    320