Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com>
      4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "TextEncoding.h"
     30 
     31 #include "CString.h"
     32 #include "PlatformString.h"
     33 #include "TextCodec.h"
     34 #include "TextEncodingRegistry.h"
     35 #if USE(ICU_UNICODE)
     36 #include <unicode/unorm.h>
     37 #elif USE(QT4_UNICODE)
     38 #include <QString>
     39 #elif USE(GLIB_UNICODE)
     40 #include <glib.h>
     41 #include <wtf/gtk/GOwnPtr.h>
     42 #endif
     43 #include <wtf/HashSet.h>
     44 #include <wtf/OwnPtr.h>
     45 #include <wtf/StdLibExtras.h>
     46 
     47 namespace WebCore {
     48 
     49 static void addEncodingName(HashSet<const char*>& set, const char* name)
     50 {
     51     const char* atomicName = atomicCanonicalTextEncodingName(name);
     52     if (atomicName)
     53         set.add(atomicName);
     54 }
     55 
     56 static const TextEncoding& UTF7Encoding()
     57 {
     58     static TextEncoding globalUTF7Encoding("UTF-7");
     59     return globalUTF7Encoding;
     60 }
     61 
     62 TextEncoding::TextEncoding(const char* name)
     63     : m_name(atomicCanonicalTextEncodingName(name))
     64     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
     65 {
     66 }
     67 
     68 TextEncoding::TextEncoding(const String& name)
     69     : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length()))
     70     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
     71 {
     72 }
     73 
     74 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
     75 {
     76     if (!m_name)
     77         return String();
     78 
     79     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
     80 }
     81 
     82 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const
     83 {
     84     if (!m_name)
     85         return CString();
     86 
     87     if (!length)
     88         return "";
     89 
     90 #if USE(ICU_UNICODE)
     91     // FIXME: What's the right place to do normalization?
     92     // It's a little strange to do it inside the encode function.
     93     // Perhaps normalization should be an explicit step done before calling encode.
     94 
     95     const UChar* source = characters;
     96     size_t sourceLength = length;
     97 
     98     Vector<UChar> normalizedCharacters;
     99 
    100     UErrorCode err = U_ZERO_ERROR;
    101     if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) {
    102         // First try using the length of the original string, since normalization to NFC rarely increases length.
    103         normalizedCharacters.grow(sourceLength);
    104         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
    105         if (err == U_BUFFER_OVERFLOW_ERROR) {
    106             err = U_ZERO_ERROR;
    107             normalizedCharacters.resize(normalizedLength);
    108             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
    109         }
    110         ASSERT(U_SUCCESS(err));
    111 
    112         source = normalizedCharacters.data();
    113         sourceLength = normalizedLength;
    114     }
    115     return newTextCodec(*this)->encode(source, sourceLength, handling);
    116 #elif USE(QT4_UNICODE)
    117     QString str(reinterpret_cast<const QChar*>(characters), length);
    118     str = str.normalized(QString::NormalizationForm_C);
    119     return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling);
    120 #elif USE(GLIB_UNICODE)
    121     GOwnPtr<char> UTF8Source;
    122     UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0));
    123 
    124     GOwnPtr<char> UTF8Normalized;
    125     UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC));
    126 
    127     long UTF16Length;
    128     GOwnPtr<UChar> UTF16Normalized;
    129     UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0));
    130 
    131     return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling);
    132 #elif OS(WINCE)
    133     // normalization will be done by Windows CE API
    134     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
    135     return textCodec.get() ? textCodec->encode(characters, length, handling) : CString();
    136 #endif
    137 }
    138 
    139 const char* TextEncoding::domName() const
    140 {
    141     if (noExtendedTextEncodingNameUsed())
    142         return m_name;
    143 
    144     // We treat EUC-KR as windows-949 (its superset), but need to expose
    145     // the name 'EUC-KR' because the name 'windows-949' is not recognized by
    146     // most Korean web servers even though they do use the encoding
    147     // 'windows-949' with the name 'EUC-KR'.
    148     // FIXME: This is not thread-safe. At the moment, this function is
    149     // only accessed in a single thread, but eventually has to be made
    150     // thread-safe along with usesVisualOrdering().
    151     static const char* const a = atomicCanonicalTextEncodingName("windows-949");
    152     if (m_name == a)
    153         return "EUC-KR";
    154     return m_name;
    155 }
    156 
    157 bool TextEncoding::usesVisualOrdering() const
    158 {
    159     if (noExtendedTextEncodingNameUsed())
    160         return false;
    161 
    162     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
    163     return m_name == a;
    164 }
    165 
    166 bool TextEncoding::isJapanese() const
    167 {
    168     if (noExtendedTextEncodingNameUsed())
    169         return false;
    170 
    171     DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ());
    172     if (set.isEmpty()) {
    173         addEncodingName(set, "x-mac-japanese");
    174         addEncodingName(set, "cp932");
    175         addEncodingName(set, "JIS_X0201");
    176         addEncodingName(set, "JIS_X0208-1983");
    177         addEncodingName(set, "JIS_X0208-1990");
    178         addEncodingName(set, "JIS_X0212-1990");
    179         addEncodingName(set, "JIS_C6226-1978");
    180         addEncodingName(set, "Shift_JIS_X0213-2000");
    181         addEncodingName(set, "ISO-2022-JP");
    182         addEncodingName(set, "ISO-2022-JP-2");
    183         addEncodingName(set, "ISO-2022-JP-1");
    184         addEncodingName(set, "ISO-2022-JP-3");
    185         addEncodingName(set, "EUC-JP");
    186         addEncodingName(set, "Shift_JIS");
    187     }
    188     return m_name && set.contains(m_name);
    189 }
    190 
    191 UChar TextEncoding::backslashAsCurrencySymbol() const
    192 {
    193     if (noExtendedTextEncodingNameUsed())
    194         return '\\';
    195 
    196     // The text encodings below treat backslash as a currency symbol.
    197     // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information.
    198     static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000");
    199     static const char* const b = atomicCanonicalTextEncodingName("EUC-JP");
    200     return (m_name == a || m_name == b) ? 0x00A5 : '\\';
    201 }
    202 
    203 bool TextEncoding::isNonByteBasedEncoding() const
    204 {
    205     if (noExtendedTextEncodingNameUsed()) {
    206         return *this == UTF16LittleEndianEncoding()
    207             || *this == UTF16BigEndianEncoding();
    208     }
    209 
    210     return *this == UTF16LittleEndianEncoding()
    211         || *this == UTF16BigEndianEncoding()
    212         || *this == UTF32BigEndianEncoding()
    213         || *this == UTF32LittleEndianEncoding();
    214 }
    215 
    216 bool TextEncoding::isUTF7Encoding() const
    217 {
    218     if (noExtendedTextEncodingNameUsed())
    219         return false;
    220 
    221     return *this == UTF7Encoding();
    222 }
    223 
    224 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
    225 {
    226     if (isNonByteBasedEncoding())
    227         return UTF8Encoding();
    228     return *this;
    229 }
    230 
    231 // HTML5 specifies that UTF-8 be used in form submission when a form is
    232 // is a part of a document in UTF-16 probably because UTF-16 is not a
    233 // byte-based encoding and can contain 0x00. By extension, the same
    234 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
    235 // but it's fraught with problems and we'd rather steer clear of it.
    236 const TextEncoding& TextEncoding::encodingForFormSubmission() const
    237 {
    238     if (isNonByteBasedEncoding() || isUTF7Encoding())
    239         return UTF8Encoding();
    240     return *this;
    241 }
    242 
    243 const TextEncoding& ASCIIEncoding()
    244 {
    245     static TextEncoding globalASCIIEncoding("ASCII");
    246     return globalASCIIEncoding;
    247 }
    248 
    249 const TextEncoding& Latin1Encoding()
    250 {
    251     static TextEncoding globalLatin1Encoding("Latin-1");
    252     return globalLatin1Encoding;
    253 }
    254 
    255 const TextEncoding& UTF16BigEndianEncoding()
    256 {
    257     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
    258     return globalUTF16BigEndianEncoding;
    259 }
    260 
    261 const TextEncoding& UTF16LittleEndianEncoding()
    262 {
    263     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
    264     return globalUTF16LittleEndianEncoding;
    265 }
    266 
    267 const TextEncoding& UTF32BigEndianEncoding()
    268 {
    269     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
    270     return globalUTF32BigEndianEncoding;
    271 }
    272 
    273 const TextEncoding& UTF32LittleEndianEncoding()
    274 {
    275     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
    276     return globalUTF32LittleEndianEncoding;
    277 }
    278 
    279 const TextEncoding& UTF8Encoding()
    280 {
    281     static TextEncoding globalUTF8Encoding("UTF-8");
    282     ASSERT(globalUTF8Encoding.isValid());
    283     return globalUTF8Encoding;
    284 }
    285 
    286 const TextEncoding& WindowsLatin1Encoding()
    287 {
    288     static TextEncoding globalWindowsLatin1Encoding("WinLatin-1");
    289     return globalWindowsLatin1Encoding;
    290 }
    291 
    292 } // namespace WebCore
    293