Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
      3  * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com>
      4  * Copyright (C) 2007-2009 Torch Mobile, Inc.
      5  *
      6  * Redistribution and use in source and binary forms, with or without
      7  * modification, are permitted provided that the following conditions
      8  * are met:
      9  * 1. Redistributions of source code must retain the above copyright
     10  *    notice, this list of conditions and the following disclaimer.
     11  * 2. Redistributions in binary form must reproduce the above copyright
     12  *    notice, this list of conditions and the following disclaimer in the
     13  *    documentation and/or other materials provided with the distribution.
     14  *
     15  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     16  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     18  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     19  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     20  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     21  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     22  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     23  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     25  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26  */
     27 
     28 #include "config.h"
     29 #include "wtf/text/TextEncoding.h"
     30 
     31 #include "wtf/text/TextEncodingRegistry.h"
     32 #include <unicode/unorm.h>
     33 #include "wtf/OwnPtr.h"
     34 #include "wtf/StdLibExtras.h"
     35 #include "wtf/text/CString.h"
     36 #include "wtf/text/WTFString.h"
     37 
     38 namespace WTF {
     39 
     40 static const TextEncoding& UTF7Encoding()
     41 {
     42     static TextEncoding globalUTF7Encoding("UTF-7");
     43     return globalUTF7Encoding;
     44 }
     45 
     46 TextEncoding::TextEncoding(const char* name)
     47     : m_name(atomicCanonicalTextEncodingName(name))
     48 {
     49     // Aliases are valid, but not "replacement" itself.
     50     if (m_name && isReplacementEncoding(name))
     51         m_name = 0;
     52 }
     53 
     54 TextEncoding::TextEncoding(const String& name)
     55     : m_name(atomicCanonicalTextEncodingName(name))
     56 {
     57     // Aliases are valid, but not "replacement" itself.
     58     if (m_name && isReplacementEncoding(name))
     59         m_name = 0;
     60 }
     61 
     62 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const
     63 {
     64     if (!m_name)
     65         return String();
     66 
     67     return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError);
     68 }
     69 
     70 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const
     71 {
     72     if (!m_name)
     73         return CString();
     74 
     75     if (string.isEmpty())
     76         return "";
     77 
     78     OwnPtr<TextCodec> textCodec = newTextCodec(*this);
     79     CString encodedString;
     80     if (string.is8Bit())
     81         encodedString = textCodec->encode(string.characters8(), string.length(), handling);
     82     else
     83         encodedString = textCodec->encode(string.characters16(), string.length(), handling);
     84     return encodedString;
     85 }
     86 
     87 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const
     88 {
     89     if (!m_name)
     90         return CString();
     91 
     92     if (string.isEmpty())
     93         return "";
     94 
     95     // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left
     96     // unaffected by NFC. This is effectively the same as saying that all
     97     // Latin-1 text is already normalized to NFC.
     98     // Source: http://unicode.org/reports/tr15/
     99     if (string.is8Bit())
    100         return newTextCodec(*this)->encode(string.characters8(), string.length(), handling);
    101 
    102     const UChar* source = string.characters16();
    103     size_t length = string.length();
    104 
    105     Vector<UChar> normalizedCharacters;
    106 
    107     UErrorCode err = U_ZERO_ERROR;
    108     if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) {
    109         // First try using the length of the original string, since normalization to NFC rarely increases length.
    110         normalizedCharacters.grow(length);
    111         int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err);
    112         if (err == U_BUFFER_OVERFLOW_ERROR) {
    113             err = U_ZERO_ERROR;
    114             normalizedCharacters.resize(normalizedLength);
    115             normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err);
    116         }
    117         ASSERT(U_SUCCESS(err));
    118 
    119         source = normalizedCharacters.data();
    120         length = normalizedLength;
    121     }
    122 
    123     return newTextCodec(*this)->encode(source, length, handling);
    124 }
    125 
    126 bool TextEncoding::usesVisualOrdering() const
    127 {
    128     if (noExtendedTextEncodingNameUsed())
    129         return false;
    130 
    131     static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8");
    132     return m_name == a;
    133 }
    134 
    135 bool TextEncoding::isNonByteBasedEncoding() const
    136 {
    137     if (noExtendedTextEncodingNameUsed()) {
    138         return *this == UTF16LittleEndianEncoding()
    139             || *this == UTF16BigEndianEncoding();
    140     }
    141 
    142     return *this == UTF16LittleEndianEncoding()
    143         || *this == UTF16BigEndianEncoding()
    144         || *this == UTF32BigEndianEncoding()
    145         || *this == UTF32LittleEndianEncoding();
    146 }
    147 
    148 bool TextEncoding::isUTF7Encoding() const
    149 {
    150     if (noExtendedTextEncodingNameUsed())
    151         return false;
    152 
    153     return *this == UTF7Encoding();
    154 }
    155 
    156 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const
    157 {
    158     if (isNonByteBasedEncoding())
    159         return UTF8Encoding();
    160     return *this;
    161 }
    162 
    163 // HTML5 specifies that UTF-8 be used in form submission when a form is
    164 // is a part of a document in UTF-16 probably because UTF-16 is not a
    165 // byte-based encoding and can contain 0x00. By extension, the same
    166 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding,
    167 // but it's fraught with problems and we'd rather steer clear of it.
    168 const TextEncoding& TextEncoding::encodingForFormSubmission() const
    169 {
    170     if (isNonByteBasedEncoding() || isUTF7Encoding())
    171         return UTF8Encoding();
    172     return *this;
    173 }
    174 
    175 const TextEncoding& ASCIIEncoding()
    176 {
    177     static TextEncoding globalASCIIEncoding("ASCII");
    178     return globalASCIIEncoding;
    179 }
    180 
    181 const TextEncoding& Latin1Encoding()
    182 {
    183     static TextEncoding globalLatin1Encoding("latin1");
    184     return globalLatin1Encoding;
    185 }
    186 
    187 const TextEncoding& UTF16BigEndianEncoding()
    188 {
    189     static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE");
    190     return globalUTF16BigEndianEncoding;
    191 }
    192 
    193 const TextEncoding& UTF16LittleEndianEncoding()
    194 {
    195     static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE");
    196     return globalUTF16LittleEndianEncoding;
    197 }
    198 
    199 const TextEncoding& UTF32BigEndianEncoding()
    200 {
    201     static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE");
    202     return globalUTF32BigEndianEncoding;
    203 }
    204 
    205 const TextEncoding& UTF32LittleEndianEncoding()
    206 {
    207     static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE");
    208     return globalUTF32LittleEndianEncoding;
    209 }
    210 
    211 const TextEncoding& UTF8Encoding()
    212 {
    213     static TextEncoding globalUTF8Encoding("UTF-8");
    214     ASSERT(globalUTF8Encoding.isValid());
    215     return globalUTF8Encoding;
    216 }
    217 
    218 const TextEncoding& WindowsLatin1Encoding()
    219 {
    220     static TextEncoding globalWindowsLatin1Encoding("WinLatin1");
    221     return globalWindowsLatin1Encoding;
    222 }
    223 
    224 } // namespace WTF
    225