Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com>
      4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "wtf/text/TextEncoding.h"
     30 
     31 #include "wtf/text/TextCodec.h"
     32 #include "wtf/text/TextEncodingRegistry.h"
     33 #include <unicode/unorm.h>
     34 #include "wtf/OwnPtr.h"
     35 #include "wtf/StdLibExtras.h"
     36 #include "wtf/text/CString.h"
     37 #include "wtf/text/WTFString.h"
     38 
     39 namespace WTF {
     40 
     41 static const TextEncoding& UTF7Encoding()
     42 {
     43     static TextEncoding globalUTF7Encoding("UTF-7");
     44     return globalUTF7Encoding;
     45 }
     46 
     47 TextEncoding::TextEncoding(const char* name)
     48     : m_name(atomicCanonicalTextEncodingName(name))
     49     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
     50 {
     51 }
     52 
     53 TextEncoding::TextEncoding(const String& name)
     54     : m_name(atomicCanonicalTextEncodingName(name))
     55     , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol())
     56 {
     57 }
     58 
     59 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
     60 {
     61     if (!m_name)
     62         return String();
     63 
     64     return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError);
     65 }
     66 
     67 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
     68 {
     69     if (!m_name)
     70         return CString();
     71 
     72     if (string.isEmpty())
     73         return "";
     74 
     75     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
     76     CString encodedString;
     77     if (string.is8Bit())
     78         encodedString = textCodec->encode(string.characters8(), string.length(), handling);
     79     else
     80         encodedString = textCodec->encode(string.characters16(), string.length(), handling);
     81     return encodedString;
     82 }
     83 
     84 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
     85 {
     86     if (!m_name)
     87         return CString();
     88 
     89     if (string.isEmpty())
     90         return "";
     91 
     92     // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
     93     // unaffected by NFC. This is effectively the same as saying that all
     94     // Latin-1 text is already normalized to NFC.
     95     // Source: http://unicode.org/reports/tr15/
     96     if (string.is8Bit())
     97         return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
     98 
     99     const UChar* source = string.characters16();
    100     size_t length = string.length();
    101 
    102     Vector<UChar> normalizedCharacters;
    103 
    104     UErrorCode err = U_ZERO_ERROR;
    105     if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
    106         // First try using the length of the original string, since normalization to NFC rarely increases length.
    107         normalizedCharacters.grow(length);
    108         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
    109         if (err == U_BUFFER_OVERFLOW_ERROR) {
    110             err = U_ZERO_ERROR;
    111             normalizedCharacters.resize(normalizedLength);
    112             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
    113         }
    114         ASSERT(U_SUCCESS(err));
    115 
    116         source = normalizedCharacters.data();
    117         length = normalizedLength;
    118     }
    119 
    120     return newTextCodec(*this)->encode(source, length, handling);
    121 }
    122 
    123 const char* TextEncoding::domName() const
    124 {
    125     if (noExtendedTextEncodingNameUsed())
    126         return m_name;
    127 
    128     // We treat EUC-KR as windows-949 (its superset), but need to expose
    129     // the name 'EUC-KR' because the name 'windows-949' is not recognized by
    130     // most Korean web servers even though they do use the encoding
    131     // 'windows-949' with the name 'EUC-KR'.
    132     // FIXME: This is not thread-safe. At the moment, this function is
    133     // only accessed in a single thread, but eventually has to be made
    134     // thread-safe along with usesVisualOrdering().
    135     static const char* const a = atomicCanonicalTextEncodingName("windows-949");
    136     if (m_name == a)
    137         return "EUC-KR";
    138     return m_name;
    139 }
    140 
    141 bool TextEncoding::usesVisualOrdering() const
    142 {
    143     if (noExtendedTextEncodingNameUsed())
    144         return false;
    145 
    146     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
    147     return m_name == a;
    148 }
    149 
    150 bool TextEncoding::isJapanese() const
    151 {
    152     return isJapaneseEncoding(m_name);
    153 }
    154 
    155 UChar TextEncoding::backslashAsCurrencySymbol() const
    156 {
    157     return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\';
    158 }
    159 
    160 bool TextEncoding::isNonByteBasedEncoding() const
    161 {
    162     if (noExtendedTextEncodingNameUsed()) {
    163         return *this == UTF16LittleEndianEncoding()
    164             || *this == UTF16BigEndianEncoding();
    165     }
    166 
    167     return *this == UTF16LittleEndianEncoding()
    168         || *this == UTF16BigEndianEncoding()
    169         || *this == UTF32BigEndianEncoding()
    170         || *this == UTF32LittleEndianEncoding();
    171 }
    172 
    173 bool TextEncoding::isUTF7Encoding() const
    174 {
    175     if (noExtendedTextEncodingNameUsed())
    176         return false;
    177 
    178     return *this == UTF7Encoding();
    179 }
    180 
    181 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
    182 {
    183     if (isNonByteBasedEncoding())
    184         return UTF8Encoding();
    185     return *this;
    186 }
    187 
    188 // HTML5 specifies that UTF-8 be used in form submission when a form is
    189 // is a part of a document in UTF-16 probably because UTF-16 is not a
    190 // byte-based encoding and can contain 0x00. By extension, the same
    191 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
    192 // but it's fraught with problems and we'd rather steer clear of it.
    193 const TextEncoding& TextEncoding::encodingForFormSubmission() const
    194 {
    195     if (isNonByteBasedEncoding() || isUTF7Encoding())
    196         return UTF8Encoding();
    197     return *this;
    198 }
    199 
    200 const TextEncoding& ASCIIEncoding()
    201 {
    202     static TextEncoding globalASCIIEncoding("ASCII");
    203     return globalASCIIEncoding;
    204 }
    205 
    206 const TextEncoding& Latin1Encoding()
    207 {
    208     static TextEncoding globalLatin1Encoding("latin1");
    209     return globalLatin1Encoding;
    210 }
    211 
    212 const TextEncoding& UTF16BigEndianEncoding()
    213 {
    214     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
    215     return globalUTF16BigEndianEncoding;
    216 }
    217 
    218 const TextEncoding& UTF16LittleEndianEncoding()
    219 {
    220     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
    221     return globalUTF16LittleEndianEncoding;
    222 }
    223 
    224 const TextEncoding& UTF32BigEndianEncoding()
    225 {
    226     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
    227     return globalUTF32BigEndianEncoding;
    228 }
    229 
    230 const TextEncoding& UTF32LittleEndianEncoding()
    231 {
    232     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
    233     return globalUTF32LittleEndianEncoding;
    234 }
    235 
    236 const TextEncoding& UTF8Encoding()
    237 {
    238     static TextEncoding globalUTF8Encoding("UTF-8");
    239     ASSERT(globalUTF8Encoding.isValid());
    240     return globalUTF8Encoding;
    241 }
    242 
    243 const TextEncoding& WindowsLatin1Encoding()
    244 {
    245     static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
    246     return globalWindowsLatin1Encoding;
    247 }
    248 
    249 } // namespace WTF
    250