1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "wtf/text/TextEncoding.h" 30 31 #include "wtf/text/TextCodec.h" 32 #include "wtf/text/TextEncodingRegistry.h" 33 #include <unicode/unorm.h> 34 #include "wtf/OwnPtr.h" 35 #include "wtf/StdLibExtras.h" 36 #include "wtf/text/CString.h" 37 #include "wtf/text/WTFString.h" 38 39 namespace WTF { 40 41 static const TextEncoding& UTF7Encoding() 42 { 43 static TextEncoding globalUTF7Encoding("UTF-7"); 44 return globalUTF7Encoding; 45 } 46 47 TextEncoding::TextEncoding(const char* name) 48 : m_name(atomicCanonicalTextEncodingName(name)) 49 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 50 { 51 } 52 53 TextEncoding::TextEncoding(const String& name) 54 : m_name(atomicCanonicalTextEncodingName(name)) 55 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 56 { 57 } 58 59 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 60 { 61 if (!m_name) 62 return String(); 63 64 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); 65 } 66 67 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const 68 { 69 if (!m_name) 70 return CString(); 71 72 if (string.isEmpty()) 73 return ""; 74 75 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 76 CString encodedString; 77 if (string.is8Bit()) 78 encodedString = textCodec->encode(string.characters8(), string.length(), handling); 79 else 80 encodedString = textCodec->encode(string.characters16(), string.length(), handling); 81 return encodedString; 82 } 83 84 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const 85 { 86 if (!m_name) 87 return CString(); 88 89 if (string.isEmpty()) 90 return ""; 91 92 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left 93 // unaffected by NFC. This is effectively the same as saying that all 94 // Latin-1 text is already normalized to NFC. 95 // Source: http://unicode.org/reports/tr15/ 96 if (string.is8Bit()) 97 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling); 98 99 const UChar* source = string.characters16(); 100 size_t length = string.length(); 101 102 Vector<UChar> normalizedCharacters; 103 104 UErrorCode err = U_ZERO_ERROR; 105 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) { 106 // First try using the length of the original string, since normalization to NFC rarely increases length. 107 normalizedCharacters.grow(length); 108 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 109 if (err == U_BUFFER_OVERFLOW_ERROR) { 110 err = U_ZERO_ERROR; 111 normalizedCharacters.resize(normalizedLength); 112 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 113 } 114 ASSERT(U_SUCCESS(err)); 115 116 source = normalizedCharacters.data(); 117 length = normalizedLength; 118 } 119 120 return newTextCodec(*this)->encode(source, length, handling); 121 } 122 123 const char* TextEncoding::domName() const 124 { 125 if (noExtendedTextEncodingNameUsed()) 126 return m_name; 127 128 // We treat EUC-KR as windows-949 (its superset), but need to expose 129 // the name 'EUC-KR' because the name 'windows-949' is not recognized by 130 // most Korean web servers even though they do use the encoding 131 // 'windows-949' with the name 'EUC-KR'. 132 // FIXME: This is not thread-safe. At the moment, this function is 133 // only accessed in a single thread, but eventually has to be made 134 // thread-safe along with usesVisualOrdering(). 135 static const char* const a = atomicCanonicalTextEncodingName("windows-949"); 136 if (m_name == a) 137 return "EUC-KR"; 138 return m_name; 139 } 140 141 bool TextEncoding::usesVisualOrdering() const 142 { 143 if (noExtendedTextEncodingNameUsed()) 144 return false; 145 146 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 147 return m_name == a; 148 } 149 150 bool TextEncoding::isJapanese() const 151 { 152 return isJapaneseEncoding(m_name); 153 } 154 155 UChar TextEncoding::backslashAsCurrencySymbol() const 156 { 157 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\'; 158 } 159 160 bool TextEncoding::isNonByteBasedEncoding() const 161 { 162 if (noExtendedTextEncodingNameUsed()) { 163 return *this == UTF16LittleEndianEncoding() 164 || *this == UTF16BigEndianEncoding(); 165 } 166 167 return *this == UTF16LittleEndianEncoding() 168 || *this == UTF16BigEndianEncoding() 169 || *this == UTF32BigEndianEncoding() 170 || *this == UTF32LittleEndianEncoding(); 171 } 172 173 bool TextEncoding::isUTF7Encoding() const 174 { 175 if (noExtendedTextEncodingNameUsed()) 176 return false; 177 178 return *this == UTF7Encoding(); 179 } 180 181 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 182 { 183 if (isNonByteBasedEncoding()) 184 return UTF8Encoding(); 185 return *this; 186 } 187 188 // HTML5 specifies that UTF-8 be used in form submission when a form is 189 // is a part of a document in UTF-16 probably because UTF-16 is not a 190 // byte-based encoding and can contain 0x00. By extension, the same 191 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 192 // but it's fraught with problems and we'd rather steer clear of it. 193 const TextEncoding& TextEncoding::encodingForFormSubmission() const 194 { 195 if (isNonByteBasedEncoding() || isUTF7Encoding()) 196 return UTF8Encoding(); 197 return *this; 198 } 199 200 const TextEncoding& ASCIIEncoding() 201 { 202 static TextEncoding globalASCIIEncoding("ASCII"); 203 return globalASCIIEncoding; 204 } 205 206 const TextEncoding& Latin1Encoding() 207 { 208 static TextEncoding globalLatin1Encoding("latin1"); 209 return globalLatin1Encoding; 210 } 211 212 const TextEncoding& UTF16BigEndianEncoding() 213 { 214 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 215 return globalUTF16BigEndianEncoding; 216 } 217 218 const TextEncoding& UTF16LittleEndianEncoding() 219 { 220 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 221 return globalUTF16LittleEndianEncoding; 222 } 223 224 const TextEncoding& UTF32BigEndianEncoding() 225 { 226 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 227 return globalUTF32BigEndianEncoding; 228 } 229 230 const TextEncoding& UTF32LittleEndianEncoding() 231 { 232 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 233 return globalUTF32LittleEndianEncoding; 234 } 235 236 const TextEncoding& UTF8Encoding() 237 { 238 static TextEncoding globalUTF8Encoding("UTF-8"); 239 ASSERT(globalUTF8Encoding.isValid()); 240 return globalUTF8Encoding; 241 } 242 243 const TextEncoding& WindowsLatin1Encoding() 244 { 245 static TextEncoding globalWindowsLatin1Encoding("WinLatin1"); 246 return globalWindowsLatin1Encoding; 247 } 248 249 } // namespace WTF 250