1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "TextEncoding.h" 30 31 #include "CString.h" 32 #include "PlatformString.h" 33 #include "TextCodec.h" 34 #include "TextEncodingRegistry.h" 35 #if USE(ICU_UNICODE) 36 #include <unicode/unorm.h> 37 #elif USE(QT4_UNICODE) 38 #include <QString> 39 #elif USE(GLIB_UNICODE) 40 #include <glib.h> 41 #include <wtf/gtk/GOwnPtr.h> 42 #endif 43 #include <wtf/HashSet.h> 44 #include <wtf/OwnPtr.h> 45 #include <wtf/StdLibExtras.h> 46 47 namespace WebCore { 48 49 static void addEncodingName(HashSet<const char*>& set, const char* name) 50 { 51 const char* atomicName = atomicCanonicalTextEncodingName(name); 52 if (atomicName) 53 set.add(atomicName); 54 } 55 56 static const TextEncoding& UTF7Encoding() 57 { 58 static TextEncoding globalUTF7Encoding("UTF-7"); 59 return globalUTF7Encoding; 60 } 61 62 TextEncoding::TextEncoding(const char* name) 63 : m_name(atomicCanonicalTextEncodingName(name)) 64 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 65 { 66 } 67 68 TextEncoding::TextEncoding(const String& name) 69 : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length())) 70 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 71 { 72 } 73 74 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 75 { 76 if (!m_name) 77 return String(); 78 79 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); 80 } 81 82 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const 83 { 84 if (!m_name) 85 return CString(); 86 87 if (!length) 88 return ""; 89 90 #if USE(ICU_UNICODE) 91 // FIXME: What's the right place to do normalization? 92 // It's a little strange to do it inside the encode function. 93 // Perhaps normalization should be an explicit step done before calling encode. 94 95 const UChar* source = characters; 96 size_t sourceLength = length; 97 98 Vector<UChar> normalizedCharacters; 99 100 UErrorCode err = U_ZERO_ERROR; 101 if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { 102 // First try using the length of the original string, since normalization to NFC rarely increases length. 103 normalizedCharacters.grow(sourceLength); 104 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 105 if (err == U_BUFFER_OVERFLOW_ERROR) { 106 err = U_ZERO_ERROR; 107 normalizedCharacters.resize(normalizedLength); 108 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 109 } 110 ASSERT(U_SUCCESS(err)); 111 112 source = normalizedCharacters.data(); 113 sourceLength = normalizedLength; 114 } 115 return newTextCodec(*this)->encode(source, sourceLength, handling); 116 #elif USE(QT4_UNICODE) 117 QString str(reinterpret_cast<const QChar*>(characters), length); 118 str = str.normalized(QString::NormalizationForm_C); 119 return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); 120 #elif USE(GLIB_UNICODE) 121 GOwnPtr<char> UTF8Source; 122 UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); 123 124 GOwnPtr<char> UTF8Normalized; 125 UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); 126 127 long UTF16Length; 128 GOwnPtr<UChar> UTF16Normalized; 129 UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0)); 130 131 return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling); 132 #elif OS(WINCE) 133 // normalization will be done by Windows CE API 134 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 135 return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); 136 #endif 137 } 138 139 const char* TextEncoding::domName() const 140 { 141 if (noExtendedTextEncodingNameUsed()) 142 return m_name; 143 144 // We treat EUC-KR as windows-949 (its superset), but need to expose 145 // the name 'EUC-KR' because the name 'windows-949' is not recognized by 146 // most Korean web servers even though they do use the encoding 147 // 'windows-949' with the name 'EUC-KR'. 148 // FIXME: This is not thread-safe. At the moment, this function is 149 // only accessed in a single thread, but eventually has to be made 150 // thread-safe along with usesVisualOrdering(). 151 static const char* const a = atomicCanonicalTextEncodingName("windows-949"); 152 if (m_name == a) 153 return "EUC-KR"; 154 return m_name; 155 } 156 157 bool TextEncoding::usesVisualOrdering() const 158 { 159 if (noExtendedTextEncodingNameUsed()) 160 return false; 161 162 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 163 return m_name == a; 164 } 165 166 bool TextEncoding::isJapanese() const 167 { 168 if (noExtendedTextEncodingNameUsed()) 169 return false; 170 171 DEFINE_STATIC_LOCAL(HashSet<const char*>, set, ()); 172 if (set.isEmpty()) { 173 addEncodingName(set, "x-mac-japanese"); 174 addEncodingName(set, "cp932"); 175 addEncodingName(set, "JIS_X0201"); 176 addEncodingName(set, "JIS_X0208-1983"); 177 addEncodingName(set, "JIS_X0208-1990"); 178 addEncodingName(set, "JIS_X0212-1990"); 179 addEncodingName(set, "JIS_C6226-1978"); 180 addEncodingName(set, "Shift_JIS_X0213-2000"); 181 addEncodingName(set, "ISO-2022-JP"); 182 addEncodingName(set, "ISO-2022-JP-2"); 183 addEncodingName(set, "ISO-2022-JP-1"); 184 addEncodingName(set, "ISO-2022-JP-3"); 185 addEncodingName(set, "EUC-JP"); 186 addEncodingName(set, "Shift_JIS"); 187 } 188 return m_name && set.contains(m_name); 189 } 190 191 UChar TextEncoding::backslashAsCurrencySymbol() const 192 { 193 if (noExtendedTextEncodingNameUsed()) 194 return '\\'; 195 196 // The text encodings below treat backslash as a currency symbol. 197 // See http://blogs.msdn.com/michkap/archive/2005/09/17/469941.aspx for more information. 198 static const char* const a = atomicCanonicalTextEncodingName("Shift_JIS_X0213-2000"); 199 static const char* const b = atomicCanonicalTextEncodingName("EUC-JP"); 200 return (m_name == a || m_name == b) ? 0x00A5 : '\\'; 201 } 202 203 bool TextEncoding::isNonByteBasedEncoding() const 204 { 205 if (noExtendedTextEncodingNameUsed()) { 206 return *this == UTF16LittleEndianEncoding() 207 || *this == UTF16BigEndianEncoding(); 208 } 209 210 return *this == UTF16LittleEndianEncoding() 211 || *this == UTF16BigEndianEncoding() 212 || *this == UTF32BigEndianEncoding() 213 || *this == UTF32LittleEndianEncoding(); 214 } 215 216 bool TextEncoding::isUTF7Encoding() const 217 { 218 if (noExtendedTextEncodingNameUsed()) 219 return false; 220 221 return *this == UTF7Encoding(); 222 } 223 224 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 225 { 226 if (isNonByteBasedEncoding()) 227 return UTF8Encoding(); 228 return *this; 229 } 230 231 // HTML5 specifies that UTF-8 be used in form submission when a form is 232 // is a part of a document in UTF-16 probably because UTF-16 is not a 233 // byte-based encoding and can contain 0x00. By extension, the same 234 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 235 // but it's fraught with problems and we'd rather steer clear of it. 236 const TextEncoding& TextEncoding::encodingForFormSubmission() const 237 { 238 if (isNonByteBasedEncoding() || isUTF7Encoding()) 239 return UTF8Encoding(); 240 return *this; 241 } 242 243 const TextEncoding& ASCIIEncoding() 244 { 245 static TextEncoding globalASCIIEncoding("ASCII"); 246 return globalASCIIEncoding; 247 } 248 249 const TextEncoding& Latin1Encoding() 250 { 251 static TextEncoding globalLatin1Encoding("Latin-1"); 252 return globalLatin1Encoding; 253 } 254 255 const TextEncoding& UTF16BigEndianEncoding() 256 { 257 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 258 return globalUTF16BigEndianEncoding; 259 } 260 261 const TextEncoding& UTF16LittleEndianEncoding() 262 { 263 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 264 return globalUTF16LittleEndianEncoding; 265 } 266 267 const TextEncoding& UTF32BigEndianEncoding() 268 { 269 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 270 return globalUTF32BigEndianEncoding; 271 } 272 273 const TextEncoding& UTF32LittleEndianEncoding() 274 { 275 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 276 return globalUTF32LittleEndianEncoding; 277 } 278 279 const TextEncoding& UTF8Encoding() 280 { 281 static TextEncoding globalUTF8Encoding("UTF-8"); 282 ASSERT(globalUTF8Encoding.isValid()); 283 return globalUTF8Encoding; 284 } 285 286 const TextEncoding& WindowsLatin1Encoding() 287 { 288 static TextEncoding globalWindowsLatin1Encoding("WinLatin-1"); 289 return globalWindowsLatin1Encoding; 290 } 291 292 } // namespace WebCore 293