1 /* 2 * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved. 3 * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com> 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY 15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "config.h" 28 #include "TextCodecMac.h" 29 30 #include "CharsetData.h" 31 #include "PlatformString.h" 32 #include "ThreadGlobalData.h" 33 #include <wtf/Assertions.h> 34 #include <wtf/PassOwnPtr.h> 35 #include <wtf/RetainPtr.h> 36 #include <wtf/Threading.h> 37 #include <wtf/text/CString.h> 38 #include <wtf/unicode/CharacterNames.h> 39 40 using namespace std; 41 42 namespace WebCore { 43 44 // We need to keep this because ICU doesn't support some of the encodings that we need: 45 // <http://bugs.webkit.org/show_bug.cgi?id=4195>. 46 47 const size_t ConversionBufferSize = 16384; 48 49 static TECConverterWrapper& cachedConverterTEC() 50 { 51 return threadGlobalData().cachedConverterTEC(); 52 } 53 54 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar) 55 { 56 TECTextEncodingID lastEncoding = invalidEncoding; 57 const char* lastName = 0; 58 59 for (size_t i = 0; CharsetTable[i].name; ++i) { 60 if (CharsetTable[i].encoding != lastEncoding) { 61 lastEncoding = CharsetTable[i].encoding; 62 lastName = CharsetTable[i].name; 63 } 64 registrar(CharsetTable[i].name, lastName); 65 } 66 } 67 68 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData) 69 { 70 return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData)); 71 } 72 73 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar) 74 { 75 TECTextEncodingID lastEncoding = invalidEncoding; 76 77 for (size_t i = 0; CharsetTable[i].name; ++i) 78 if (CharsetTable[i].encoding != lastEncoding) { 79 registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding); 80 lastEncoding = CharsetTable[i].encoding; 81 } 82 } 83 84 TextCodecMac::TextCodecMac(TECTextEncodingID encoding) 85 : m_encoding(encoding) 86 , m_numBufferedBytes(0) 87 , m_converterTEC(0) 88 { 89 } 90 91 TextCodecMac::~TextCodecMac() 92 { 93 releaseTECConverter(); 94 } 95 96 void TextCodecMac::releaseTECConverter() const 97 { 98 if (m_converterTEC) { 99 TECConverterWrapper& cachedConverter = cachedConverterTEC(); 100 if (cachedConverter.converter) 101 TECDisposeConverter(cachedConverter.converter); 102 cachedConverter.converter = m_converterTEC; 103 cachedConverter.encoding = m_encoding; 104 m_converterTEC = 0; 105 } 106 } 107 108 OSStatus TextCodecMac::createTECConverter() const 109 { 110 TECConverterWrapper& cachedConverter = cachedConverterTEC(); 111 112 bool cachedEncodingEqual = cachedConverter.encoding == m_encoding; 113 cachedConverter.encoding = invalidEncoding; 114 115 if (cachedEncodingEqual && cachedConverter.converter) { 116 m_converterTEC = cachedConverter.converter; 117 cachedConverter.converter = 0; 118 119 TECClearConverterContextInfo(m_converterTEC); 120 } else { 121 OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding, 122 CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat)); 123 if (status) 124 return status; 125 126 TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask); 127 } 128 129 return noErr; 130 } 131 132 OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength, 133 void *outputBuffer, int outputBufferLength, int& outputLength) 134 { 135 OSStatus status; 136 unsigned long bytesRead = 0; 137 unsigned long bytesWritten = 0; 138 139 if (m_numBufferedBytes != 0) { 140 // Finish converting a partial character that's in our buffer. 141 142 // First, fill the partial character buffer with as many bytes as are available. 143 ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes)); 144 const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes; 145 const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength); 146 ASSERT(bytesToPutInBuffer != 0); 147 memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer); 148 149 // Now, do a conversion on the buffer. 150 status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead, 151 reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); 152 ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer); 153 154 if (status == kTECPartialCharErr && bytesRead == 0) { 155 // Handle the case where the partial character was not converted. 156 if (bytesToPutInBuffer >= spaceInBuffer) { 157 LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes)); 158 m_numBufferedBytes = 0; 159 status = kTECUnmappableElementErr; // should never happen, but use this error code 160 } else { 161 // Tell the caller we read all the source bytes and keep them in the buffer. 162 m_numBufferedBytes += bytesToPutInBuffer; 163 bytesRead = bytesToPutInBuffer; 164 status = noErr; 165 } 166 } else { 167 // We are done with the partial character buffer. 168 // Also, we have read some of the bytes from the main buffer. 169 if (bytesRead > m_numBufferedBytes) { 170 bytesRead -= m_numBufferedBytes; 171 } else { 172 LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr"); 173 bytesRead = 0; 174 } 175 m_numBufferedBytes = 0; 176 if (status == kTECPartialCharErr) { 177 // While there may be a partial character problem in the small buffer, 178 // we have to try again and not get confused and think there is a partial 179 // character problem in the large buffer. 180 status = noErr; 181 } 182 } 183 } else { 184 status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead, 185 static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten); 186 ASSERT(static_cast<int>(bytesRead) <= inputBufferLength); 187 } 188 189 // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus. 190 if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0) 191 status = kTECOutputBufferFullStatus; 192 193 inputLength = bytesRead; 194 outputLength = bytesWritten; 195 return status; 196 } 197 198 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError) 199 { 200 // Get a converter for the passed-in encoding. 201 if (!m_converterTEC && createTECConverter() != noErr) 202 return String(); 203 204 Vector<UChar> result; 205 206 const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes); 207 int sourceLength = length; 208 bool bufferWasFull = false; 209 UniChar buffer[ConversionBufferSize]; 210 211 while ((sourceLength || bufferWasFull) && !sawError) { 212 int bytesRead = 0; 213 int bytesWritten = 0; 214 OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten); 215 ASSERT(bytesRead <= sourceLength); 216 sourcePointer += bytesRead; 217 sourceLength -= bytesRead; 218 219 switch (status) { 220 case noErr: 221 case kTECOutputBufferFullStatus: 222 break; 223 case kTextMalformedInputErr: 224 case kTextUndefinedElementErr: 225 // FIXME: Put FFFD character into the output string in this case? 226 TECClearConverterContextInfo(m_converterTEC); 227 if (stopOnError) { 228 sawError = true; 229 break; 230 } 231 if (sourceLength) { 232 sourcePointer += 1; 233 sourceLength -= 1; 234 } 235 break; 236 case kTECPartialCharErr: { 237 // Put the partial character into the buffer. 238 ASSERT(m_numBufferedBytes == 0); 239 const int bufferSize = sizeof(m_numBufferedBytes); 240 if (sourceLength < bufferSize) { 241 memcpy(m_bufferedBytes, sourcePointer, sourceLength); 242 m_numBufferedBytes = sourceLength; 243 } else { 244 LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength); 245 } 246 sourceLength = 0; 247 break; 248 } 249 default: 250 sawError = true; 251 return String(); 252 } 253 254 ASSERT(!(bytesWritten % sizeof(UChar))); 255 result.append(buffer, bytesWritten / sizeof(UChar)); 256 257 bufferWasFull = status == kTECOutputBufferFullStatus; 258 } 259 260 if (flush) { 261 unsigned long bytesWritten = 0; 262 TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten); 263 ASSERT(!(bytesWritten % sizeof(UChar))); 264 result.append(buffer, bytesWritten / sizeof(UChar)); 265 } 266 267 String resultString = String::adopt(result); 268 269 // <rdar://problem/3225472> 270 // Simplified Chinese pages use the code A3A0 to mean "full-width space". 271 // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice. 272 // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space). 273 if (m_encoding == kCFStringEncodingGB_18030_2000) 274 resultString.replace(0xE5E5, ideographicSpace); 275 276 return resultString; 277 } 278 279 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling) 280 { 281 // FIXME: We should really use TEC here instead of CFString for consistency with the other direction. 282 283 // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign. 284 // Encoding will change the yen sign back into a backslash. 285 String copy(characters, length); 286 copy.replace('\\', m_backslashAsCurrencySymbol); 287 RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString()); 288 289 CFIndex startPos = 0; 290 CFIndex charactersLeft = CFStringGetLength(cfs.get()); 291 Vector<char> result; 292 size_t size = 0; 293 UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0; 294 while (charactersLeft > 0) { 295 CFRange range = CFRangeMake(startPos, charactersLeft); 296 CFIndex bufferLength; 297 CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength); 298 299 result.grow(size + bufferLength); 300 unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size); 301 CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength); 302 size += bufferLength; 303 304 if (charactersConverted != charactersLeft) { 305 unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); 306 ++charactersConverted; 307 if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate 308 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted); 309 if ((low & 0xFC00) == 0xDC00) { // is low surrogate 310 badChar <<= 10; 311 badChar += low; 312 badChar += 0x10000 - (0xD800 << 10) - 0xDC00; 313 ++charactersConverted; 314 } 315 } 316 UnencodableReplacementArray entity; 317 int entityLength = getUnencodableReplacement(badChar, handling, entity); 318 result.grow(size + entityLength); 319 memcpy(result.data() + size, entity, entityLength); 320 size += entityLength; 321 } 322 323 startPos += charactersConverted; 324 charactersLeft -= charactersConverted; 325 } 326 return CString(result.data(), size); 327 } 328 329 } // namespace WebCore 330