1 /* 2 * Copyright (C) 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27 28 #include "config.h" 29 #include "wtf/text/TextEncoding.h" 30 31 #include "wtf/text/TextEncodingRegistry.h" 32 #include <unicode/unorm.h> 33 #include "wtf/OwnPtr.h" 34 #include "wtf/StdLibExtras.h" 35 #include "wtf/text/CString.h" 36 #include "wtf/text/WTFString.h" 37 38 namespace WTF { 39 40 static const TextEncoding& UTF7Encoding() 41 { 42 static TextEncoding globalUTF7Encoding("UTF-7"); 43 return globalUTF7Encoding; 44 } 45 46 TextEncoding::TextEncoding(const char* name) 47 : m_name(atomicCanonicalTextEncodingName(name)) 48 { 49 // Aliases are valid, but not "replacement" itself. 50 if (m_name && isReplacementEncoding(name)) 51 m_name = 0; 52 } 53 54 TextEncoding::TextEncoding(const String& name) 55 : m_name(atomicCanonicalTextEncodingName(name)) 56 { 57 // Aliases are valid, but not "replacement" itself. 58 if (m_name && isReplacementEncoding(name)) 59 m_name = 0; 60 } 61 62 String TextEncoding::decode(const char* data, size_t length, bool stopOnError, bool& sawError) const 63 { 64 if (!m_name) 65 return String(); 66 67 return newTextCodec(*this)->decode(data, length, DataEOF, stopOnError, sawError); 68 } 69 70 CString TextEncoding::encode(const String& string, UnencodableHandling handling) const 71 { 72 if (!m_name) 73 return CString(); 74 75 if (string.isEmpty()) 76 return ""; 77 78 OwnPtr<TextCodec> textCodec = newTextCodec(*this); 79 CString encodedString; 80 if (string.is8Bit()) 81 encodedString = textCodec->encode(string.characters8(), string.length(), handling); 82 else 83 encodedString = textCodec->encode(string.characters16(), string.length(), handling); 84 return encodedString; 85 } 86 87 CString TextEncoding::normalizeAndEncode(const String& string, UnencodableHandling handling) const 88 { 89 if (!m_name) 90 return CString(); 91 92 if (string.isEmpty()) 93 return ""; 94 95 // Text exclusively containing Latin-1 characters (U+0000..U+00FF) is left 96 // unaffected by NFC. This is effectively the same as saying that all 97 // Latin-1 text is already normalized to NFC. 98 // Source: http://unicode.org/reports/tr15/ 99 if (string.is8Bit()) 100 return newTextCodec(*this)->encode(string.characters8(), string.length(), handling); 101 102 const UChar* source = string.characters16(); 103 size_t length = string.length(); 104 105 Vector<UChar> normalizedCharacters; 106 107 UErrorCode err = U_ZERO_ERROR; 108 if (unorm_quickCheck(source, length, UNORM_NFC, &err) != UNORM_YES) { 109 // First try using the length of the original string, since normalization to NFC rarely increases length. 110 normalizedCharacters.grow(length); 111 int32_t normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), length, &err); 112 if (err == U_BUFFER_OVERFLOW_ERROR) { 113 err = U_ZERO_ERROR; 114 normalizedCharacters.resize(normalizedLength); 115 normalizedLength = unorm_normalize(source, length, UNORM_NFC, 0, normalizedCharacters.data(), normalizedLength, &err); 116 } 117 ASSERT(U_SUCCESS(err)); 118 119 source = normalizedCharacters.data(); 120 length = normalizedLength; 121 } 122 123 return newTextCodec(*this)->encode(source, length, handling); 124 } 125 126 bool TextEncoding::usesVisualOrdering() const 127 { 128 if (noExtendedTextEncodingNameUsed()) 129 return false; 130 131 static const char* const a = atomicCanonicalTextEncodingName("ISO-8859-8"); 132 return m_name == a; 133 } 134 135 bool TextEncoding::isNonByteBasedEncoding() const 136 { 137 if (noExtendedTextEncodingNameUsed()) { 138 return *this == UTF16LittleEndianEncoding() 139 || *this == UTF16BigEndianEncoding(); 140 } 141 142 return *this == UTF16LittleEndianEncoding() 143 || *this == UTF16BigEndianEncoding() 144 || *this == UTF32BigEndianEncoding() 145 || *this == UTF32LittleEndianEncoding(); 146 } 147 148 bool TextEncoding::isUTF7Encoding() const 149 { 150 if (noExtendedTextEncodingNameUsed()) 151 return false; 152 153 return *this == UTF7Encoding(); 154 } 155 156 const TextEncoding& TextEncoding::closestByteBasedEquivalent() const 157 { 158 if (isNonByteBasedEncoding()) 159 return UTF8Encoding(); 160 return *this; 161 } 162 163 // HTML5 specifies that UTF-8 be used in form submission when a form is 164 // is a part of a document in UTF-16 probably because UTF-16 is not a 165 // byte-based encoding and can contain 0x00. By extension, the same 166 // should be done for UTF-32. In case of UTF-7, it is a byte-based encoding, 167 // but it's fraught with problems and we'd rather steer clear of it. 168 const TextEncoding& TextEncoding::encodingForFormSubmission() const 169 { 170 if (isNonByteBasedEncoding() || isUTF7Encoding()) 171 return UTF8Encoding(); 172 return *this; 173 } 174 175 const TextEncoding& ASCIIEncoding() 176 { 177 static TextEncoding globalASCIIEncoding("ASCII"); 178 return globalASCIIEncoding; 179 } 180 181 const TextEncoding& Latin1Encoding() 182 { 183 static TextEncoding globalLatin1Encoding("latin1"); 184 return globalLatin1Encoding; 185 } 186 187 const TextEncoding& UTF16BigEndianEncoding() 188 { 189 static TextEncoding globalUTF16BigEndianEncoding("UTF-16BE"); 190 return globalUTF16BigEndianEncoding; 191 } 192 193 const TextEncoding& UTF16LittleEndianEncoding() 194 { 195 static TextEncoding globalUTF16LittleEndianEncoding("UTF-16LE"); 196 return globalUTF16LittleEndianEncoding; 197 } 198 199 const TextEncoding& UTF32BigEndianEncoding() 200 { 201 static TextEncoding globalUTF32BigEndianEncoding("UTF-32BE"); 202 return globalUTF32BigEndianEncoding; 203 } 204 205 const TextEncoding& UTF32LittleEndianEncoding() 206 { 207 static TextEncoding globalUTF32LittleEndianEncoding("UTF-32LE"); 208 return globalUTF32LittleEndianEncoding; 209 } 210 211 const TextEncoding& UTF8Encoding() 212 { 213 static TextEncoding globalUTF8Encoding("UTF-8"); 214 ASSERT(globalUTF8Encoding.isValid()); 215 return globalUTF8Encoding; 216 } 217 218 const TextEncoding& WindowsLatin1Encoding() 219 { 220 static TextEncoding globalWindowsLatin1Encoding("WinLatin1"); 221 return globalWindowsLatin1Encoding; 222 } 223 224 } // namespace WTF 225