1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "TextEncoding.h" 30 31 #include "PlatformString.h" 32 #include "TextCodec.h" 33 #include "TextEncodingRegistry.h" 34 #if USE(ICU_UNICODE) 35 #include <unicode/unorm.h> 36 #elif USE(QT4_UNICODE) 37 #include <QString> 38 #elif USE(GLIB_UNICODE) 39 #include <glib.h> 40 #include "GOwnPtr.h" 41 #endif 42 #include <wtf/text/CString.h> 43 #include <wtf/OwnPtr.h> 44 #include <wtf/StdLibExtras.h> 45 46 namespace WebCore { 47 48 static const TextEncoding& UTF7Encoding() 49 { 50 static TextEncoding globalUTF7Encoding("UTF-7"); 51 return globalUTF7Encoding; 52 } 53 54 TextEncoding::TextEncoding(const char* name) 55 : m_name(atomicCanonicalTextEncodingName(name)) 56 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 57 { 58 } 59 60 TextEncoding::TextEncoding(const String& name) 61 : m_name(atomicCanonicalTextEncodingName(name.characters(), name.length())) 62 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 63 { 64 } 65 66 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 67 { 68 if (!m_name) 69 return String(); 70 71 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); 72 } 73 74 CString TextEncoding::encode(const UChar* characters, size_t length, UnencodableHandling handling) const 75 { 76 if (!m_name) 77 return CString(); 78 79 if (!length) 80 return ""; 81 82 #if USE(ICU_UNICODE) 83 // FIXME: What's the right place to do normalization? 84 // It's a little strange to do it inside the encode function. 85 // Perhaps normalization should be an explicit step done before calling encode. 86 87 const UChar* source = characters; 88 size_t sourceLength = length; 89 90 Vector<UChar> normalizedCharacters; 91 92 UErrorCode err = U_ZERO_ERROR; 93 if (unorm_quickCheck(source, sourceLength, UNORM_NFC, &err) != UNORM_YES) { 94 // First try using the length of the original string, since normalization to NFC rarely increases length. 95 normalizedCharacters.grow(sourceLength); 96 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 97 if (err == U_BUFFER_OVERFLOW_ERROR) { 98 err = U_ZERO_ERROR; 99 normalizedCharacters.resize(normalizedLength); 100 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 101 } 102 ASSERT(U_SUCCESS(err)); 103 104 source = normalizedCharacters.data(); 105 sourceLength = normalizedLength; 106 } 107 return newTextCodec(*this)->encode(source, sourceLength, handling); 108 #elif USE(QT4_UNICODE) 109 QString str(reinterpret_cast<const QChar*>(characters), length); 110 str = str.normalized(QString::NormalizationForm_C); 111 return newTextCodec(*this)->encode(reinterpret_cast<const UChar *>(str.utf16()), str.length(), handling); 112 #elif USE(GLIB_UNICODE) 113 GOwnPtr<char> UTF8Source; 114 UTF8Source.set(g_utf16_to_utf8(characters, length, 0, 0, 0)); 115 if (!UTF8Source) { 116 // If conversion to UTF-8 failed, try with the string without normalization 117 return newTextCodec(*this)->encode(characters, length, handling); 118 } 119 120 GOwnPtr<char> UTF8Normalized; 121 UTF8Normalized.set(g_utf8_normalize(UTF8Source.get(), -1, G_NORMALIZE_NFC)); 122 123 long UTF16Length; 124 GOwnPtr<UChar> UTF16Normalized; 125 UTF16Normalized.set(g_utf8_to_utf16(UTF8Normalized.get(), -1, 0, &UTF16Length, 0)); 126 127 return newTextCodec(*this)->encode(UTF16Normalized.get(), UTF16Length, handling); 128 #elif OS(WINCE) 129 // normalization will be done by Windows CE API 130 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 131 return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); 132 #elif USE(BREWMP_UNICODE) 133 // FIXME: not sure if Brew MP normalizes the input string automatically 134 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 135 return textCodec.get() ? textCodec->encode(characters, length, handling) : CString(); 136 #endif 137 } 138 139 const char* TextEncoding::domName() const 140 { 141 if (noExtendedTextEncodingNameUsed()) 142 return m_name; 143 144 // We treat EUC-KR as windows-949 (its superset), but need to expose 145 // the name 'EUC-KR' because the name 'windows-949' is not recognized by 146 // most Korean web servers even though they do use the encoding 147 // 'windows-949' with the name 'EUC-KR'. 148 // FIXME: This is not thread-safe. At the moment, this function is 149 // only accessed in a single thread, but eventually has to be made 150 // thread-safe along with usesVisualOrdering(). 151 static const char* const a = atomicCanonicalTextEncodingName("windows-949"); 152 if (m_name == a) 153 return "EUC-KR"; 154 return m_name; 155 } 156 157 bool TextEncoding::usesVisualOrdering() const 158 { 159 if (noExtendedTextEncodingNameUsed()) 160 return false; 161 162 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 163 return m_name == a; 164 } 165 166 bool TextEncoding::isJapanese() const 167 { 168 return isJapaneseEncoding(m_name); 169 } 170 171 UChar TextEncoding::backslashAsCurrencySymbol() const 172 { 173 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\'; 174 } 175 176 bool TextEncoding::isNonByteBasedEncoding() const 177 { 178 if (noExtendedTextEncodingNameUsed()) { 179 return *this == UTF16LittleEndianEncoding() 180 || *this == UTF16BigEndianEncoding(); 181 } 182 183 return *this == UTF16LittleEndianEncoding() 184 || *this == UTF16BigEndianEncoding() 185 || *this == UTF32BigEndianEncoding() 186 || *this == UTF32LittleEndianEncoding(); 187 } 188 189 bool TextEncoding::isUTF7Encoding() const 190 { 191 if (noExtendedTextEncodingNameUsed()) 192 return false; 193 194 return *this == UTF7Encoding(); 195 } 196 197 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 198 { 199 if (isNonByteBasedEncoding()) 200 return UTF8Encoding(); 201 return *this; 202 } 203 204 // HTML5 specifies that UTF-8 be used in form submission when a form is 205 // is a part of a document in UTF-16 probably because UTF-16 is not a 206 // byte-based encoding and can contain 0x00. By extension, the same 207 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 208 // but it's fraught with problems and we'd rather steer clear of it. 209 const TextEncoding& TextEncoding::encodingForFormSubmission() const 210 { 211 if (isNonByteBasedEncoding() || isUTF7Encoding()) 212 return UTF8Encoding(); 213 return *this; 214 } 215 216 const TextEncoding& ASCIIEncoding() 217 { 218 static TextEncoding globalASCIIEncoding("ASCII"); 219 return globalASCIIEncoding; 220 } 221 222 const TextEncoding& Latin1Encoding() 223 { 224 static TextEncoding globalLatin1Encoding("latin1"); 225 return globalLatin1Encoding; 226 } 227 228 const TextEncoding& UTF16BigEndianEncoding() 229 { 230 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 231 return globalUTF16BigEndianEncoding; 232 } 233 234 const TextEncoding& UTF16LittleEndianEncoding() 235 { 236 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 237 return globalUTF16LittleEndianEncoding; 238 } 239 240 const TextEncoding& UTF32BigEndianEncoding() 241 { 242 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 243 return globalUTF32BigEndianEncoding; 244 } 245 246 const TextEncoding& UTF32LittleEndianEncoding() 247 { 248 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 249 return globalUTF32LittleEndianEncoding; 250 } 251 252 const TextEncoding& UTF8Encoding() 253 { 254 static TextEncoding globalUTF8Encoding("UTF-8"); 255 ASSERT(globalUTF8Encoding.isValid()); 256 return globalUTF8Encoding; 257 } 258 259 const TextEncoding& WindowsLatin1Encoding() 260 { 261 static TextEncoding globalWindowsLatin1Encoding("WinLatin-1"); 262 return globalWindowsLatin1Encoding; 263 } 264 265 } // namespace WebCore 266