1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "wtf/text/TextEncoding.h" 30 31 #include "wtf/text/TextEncodingRegistry.h" 32 #include <unicode/unorm.h> 33 #include "wtf/OwnPtr.h" 34 #include "wtf/StdLibExtras.h" 35 #include "wtf/text/CString.h" 36 #include "wtf/text/WTFString.h" 37 38 namespace WTF { 39 40 static const TextEncoding& UTF7Encoding() 41 { 42 static TextEncoding globalUTF7Encoding("UTF-7"); 43 return globalUTF7Encoding; 44 } 45 46 TextEncoding::TextEncoding(const char* name) 47 : m_name(atomicCanonicalTextEncodingName(name)) 48 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 49 { 50 } 51 52 TextEncoding::TextEncoding(const String& name) 53 : m_name(atomicCanonicalTextEncodingName(name)) 54 , m_backslashAsCurrencySymbol(backslashAsCurrencySymbol()) 55 { 56 } 57 58 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 59 { 60 if (!m_name) 61 return String(); 62 63 return newTextCodec(*this)->decode(data, length, true, stopOnError, sawError); 64 } 65 66 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const 67 { 68 if (!m_name) 69 return CString(); 70 71 if (string.isEmpty()) 72 return ""; 73 74 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 75 CString encodedString; 76 if (string.is8Bit()) 77 encodedString = textCodec->encode(string.characters8(), string.length(), handling); 78 else 79 encodedString = textCodec->encode(string.characters16(), string.length(), handling); 80 return encodedString; 81 } 82 83 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const 84 { 85 if (!m_name) 86 return CString(); 87 88 if (string.isEmpty()) 89 return ""; 90 91 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left 92 // unaffected by NFC. This is effectively the same as saying that all 93 // Latin-1 text is already normalized to NFC. 94 // Source: http://unicode.org/reports/tr15/ 95 if (string.is8Bit()) 96 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling); 97 98 const UChar* source = string.characters16(); 99 size_t length = string.length(); 100 101 Vector<UChar> normalizedCharacters; 102 103 UErrorCode err = U_ZERO_ERROR; 104 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) { 105 // First try using the length of the original string, since normalization to NFC rarely increases length. 106 normalizedCharacters.grow(length); 107 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 108 if (err == U_BUFFER_OVERFLOW_ERROR) { 109 err = U_ZERO_ERROR; 110 normalizedCharacters.resize(normalizedLength); 111 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 112 } 113 ASSERT(U_SUCCESS(err)); 114 115 source = normalizedCharacters.data(); 116 length = normalizedLength; 117 } 118 119 return newTextCodec(*this)->encode(source, length, handling); 120 } 121 122 bool TextEncoding::usesVisualOrdering() const 123 { 124 if (noExtendedTextEncodingNameUsed()) 125 return false; 126 127 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 128 return m_name == a; 129 } 130 131 UChar TextEncoding::backslashAsCurrencySymbol() const 132 { 133 return shouldShowBackslashAsCurrencySymbolIn(m_name) ? 0x00A5 : '\\'; 134 } 135 136 bool TextEncoding::isNonByteBasedEncoding() const 137 { 138 if (noExtendedTextEncodingNameUsed()) { 139 return *this == UTF16LittleEndianEncoding() 140 || *this == UTF16BigEndianEncoding(); 141 } 142 143 return *this == UTF16LittleEndianEncoding() 144 || *this == UTF16BigEndianEncoding() 145 || *this == UTF32BigEndianEncoding() 146 || *this == UTF32LittleEndianEncoding(); 147 } 148 149 bool TextEncoding::isUTF7Encoding() const 150 { 151 if (noExtendedTextEncodingNameUsed()) 152 return false; 153 154 return *this == UTF7Encoding(); 155 } 156 157 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 158 { 159 if (isNonByteBasedEncoding()) 160 return UTF8Encoding(); 161 return *this; 162 } 163 164 // HTML5 specifies that UTF-8 be used in form submission when a form is 165 // is a part of a document in UTF-16 probably because UTF-16 is not a 166 // byte-based encoding and can contain 0x00. By extension, the same 167 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 168 // but it's fraught with problems and we'd rather steer clear of it. 169 const TextEncoding& TextEncoding::encodingForFormSubmission() const 170 { 171 if (isNonByteBasedEncoding() || isUTF7Encoding()) 172 return UTF8Encoding(); 173 return *this; 174 } 175 176 const TextEncoding& ASCIIEncoding() 177 { 178 static TextEncoding globalASCIIEncoding("ASCII"); 179 return globalASCIIEncoding; 180 } 181 182 const TextEncoding& Latin1Encoding() 183 { 184 static TextEncoding globalLatin1Encoding("latin1"); 185 return globalLatin1Encoding; 186 } 187 188 const TextEncoding& UTF16BigEndianEncoding() 189 { 190 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 191 return globalUTF16BigEndianEncoding; 192 } 193 194 const TextEncoding& UTF16LittleEndianEncoding() 195 { 196 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 197 return globalUTF16LittleEndianEncoding; 198 } 199 200 const TextEncoding& UTF32BigEndianEncoding() 201 { 202 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 203 return globalUTF32BigEndianEncoding; 204 } 205 206 const TextEncoding& UTF32LittleEndianEncoding() 207 { 208 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 209 return globalUTF32LittleEndianEncoding; 210 } 211 212 const TextEncoding& UTF8Encoding() 213 { 214 static TextEncoding globalUTF8Encoding("UTF-8"); 215 ASSERT(globalUTF8Encoding.isValid()); 216 return globalUTF8Encoding; 217 } 218 219 const TextEncoding& WindowsLatin1Encoding() 220 { 221 static TextEncoding globalWindowsLatin1Encoding("WinLatin1"); 222 return globalWindowsLatin1Encoding; 223 } 224 225 } // namespace WTF 226