Home | History | Annotate | Download | only in mac
      1 /*
      2  * Copyright (C) 2004, 2006, 2008 Apple Inc. All rights reserved.
      3  * Copyright (C) 2006 Alexey Proskuryakov <ap (at) nypop.com>
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "TextCodecMac.h"
     29 
     30 #include "CharsetData.h"
     31 #include "PlatformString.h"
     32 #include "ThreadGlobalData.h"
     33 #include <wtf/Assertions.h>
     34 #include <wtf/PassOwnPtr.h>
     35 #include <wtf/RetainPtr.h>
     36 #include <wtf/Threading.h>
     37 #include <wtf/text/CString.h>
     38 #include <wtf/unicode/CharacterNames.h>
     39 
     40 using namespace std;
     41 
     42 namespace WebCore {
     43 
// We need to keep this TEC-based codec because ICU doesn't support some of the encodings that we need:
// <http://bugs.webkit.org/show_bug.cgi?id=4195>.
     46 
     47 const size_t ConversionBufferSize = 16384;
     48 
     49 static TECConverterWrapper& cachedConverterTEC()
     50 {
     51     return threadGlobalData().cachedConverterTEC();
     52 }
     53 
     54 void TextCodecMac::registerEncodingNames(EncodingNameRegistrar registrar)
     55 {
     56     TECTextEncodingID lastEncoding = invalidEncoding;
     57     const char* lastName = 0;
     58 
     59     for (size_t i = 0; CharsetTable[i].name; ++i) {
     60         if (CharsetTable[i].encoding != lastEncoding) {
     61             lastEncoding = CharsetTable[i].encoding;
     62             lastName = CharsetTable[i].name;
     63         }
     64         registrar(CharsetTable[i].name, lastName);
     65     }
     66 }
     67 
     68 static PassOwnPtr<TextCodec> newTextCodecMac(const TextEncoding&, const void* additionalData)
     69 {
     70     return new TextCodecMac(*static_cast<const TECTextEncodingID*>(additionalData));
     71 }
     72 
     73 void TextCodecMac::registerCodecs(TextCodecRegistrar registrar)
     74 {
     75     TECTextEncodingID lastEncoding = invalidEncoding;
     76 
     77     for (size_t i = 0; CharsetTable[i].name; ++i)
     78         if (CharsetTable[i].encoding != lastEncoding) {
     79             registrar(CharsetTable[i].name, newTextCodecMac, &CharsetTable[i].encoding);
     80             lastEncoding = CharsetTable[i].encoding;
     81         }
     82 }
     83 
     84 TextCodecMac::TextCodecMac(TECTextEncodingID encoding)
     85     : m_encoding(encoding)
     86     , m_numBufferedBytes(0)
     87     , m_converterTEC(0)
     88 {
     89 }
     90 
     91 TextCodecMac::~TextCodecMac()
     92 {
     93     releaseTECConverter();
     94 }
     95 
     96 void TextCodecMac::releaseTECConverter() const
     97 {
     98     if (m_converterTEC) {
     99         TECConverterWrapper& cachedConverter = cachedConverterTEC();
    100         if (cachedConverter.converter)
    101             TECDisposeConverter(cachedConverter.converter);
    102         cachedConverter.converter = m_converterTEC;
    103         cachedConverter.encoding = m_encoding;
    104         m_converterTEC = 0;
    105     }
    106 }
    107 
    108 OSStatus TextCodecMac::createTECConverter() const
    109 {
    110     TECConverterWrapper& cachedConverter = cachedConverterTEC();
    111 
    112     bool cachedEncodingEqual = cachedConverter.encoding == m_encoding;
    113     cachedConverter.encoding = invalidEncoding;
    114 
    115     if (cachedEncodingEqual && cachedConverter.converter) {
    116         m_converterTEC = cachedConverter.converter;
    117         cachedConverter.converter = 0;
    118 
    119         TECClearConverterContextInfo(m_converterTEC);
    120     } else {
    121         OSStatus status = TECCreateConverter(&m_converterTEC, m_encoding,
    122             CreateTextEncoding(kTextEncodingUnicodeDefault, kTextEncodingDefaultVariant, kUnicode16BitFormat));
    123         if (status)
    124             return status;
    125 
    126         TECSetBasicOptions(m_converterTEC, kUnicodeForceASCIIRangeMask);
    127     }
    128 
    129     return noErr;
    130 }
    131 
// Runs one TECConvertText pass and reports how much was consumed and produced.
//
// inputBuffer / inputBufferLength: source bytes available for this pass.
// inputLength (out): number of bytes of inputBuffer that were consumed.
// outputBuffer / outputBufferLength: destination and its capacity, in bytes.
// outputLength (out): number of bytes written to outputBuffer.
// Returns the TEC status, after the adjustments described in the body.
//
// If bytes of a partial character were saved by an earlier call (m_numBufferedBytes != 0),
// the pass converts m_bufferedBytes topped up with new input instead of inputBuffer itself.
OSStatus TextCodecMac::decode(const unsigned char* inputBuffer, int inputBufferLength, int& inputLength,
    void *outputBuffer, int outputBufferLength, int& outputLength)
{
    OSStatus status;
    unsigned long bytesRead = 0;
    unsigned long bytesWritten = 0;

    if (m_numBufferedBytes != 0) {
        // Finish converting a partial character that's in our buffer.

        // First, fill the partial character buffer with as many bytes as are available.
        ASSERT(m_numBufferedBytes < sizeof(m_bufferedBytes));
        const int spaceInBuffer = sizeof(m_bufferedBytes) - m_numBufferedBytes;
        const int bytesToPutInBuffer = min(spaceInBuffer, inputBufferLength);
        ASSERT(bytesToPutInBuffer != 0);
        // Note: m_numBufferedBytes is not advanced here; that only happens below if the bytes are kept.
        memcpy(m_bufferedBytes + m_numBufferedBytes, inputBuffer, bytesToPutInBuffer);

        // Now, do a conversion on the buffer.
        status = TECConvertText(m_converterTEC, m_bufferedBytes, m_numBufferedBytes + bytesToPutInBuffer, &bytesRead,
            reinterpret_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
        ASSERT(bytesRead <= m_numBufferedBytes + bytesToPutInBuffer);

        if (status == kTECPartialCharErr && bytesRead == 0) {
            // Handle the case where the partial character was not converted.
            if (bytesToPutInBuffer >= spaceInBuffer) {
                // The buffer is completely full and still not a whole character: discard it and report an error.
                LOG_ERROR("TECConvertText gave a kTECPartialCharErr but read none of the %zu bytes in the buffer", sizeof(m_bufferedBytes));
                m_numBufferedBytes = 0;
                status = kTECUnmappableElementErr; // should never happen, but use this error code
            } else {
                // Tell the caller we read all the source bytes and keep them in the buffer.
                m_numBufferedBytes += bytesToPutInBuffer;
                bytesRead = bytesToPutInBuffer;
                status = noErr;
            }
        } else {
            // We are done with the partial character buffer.
            // Also, we have read some of the bytes from the main buffer.
            if (bytesRead > m_numBufferedBytes) {
                // Report only the bytes that came from inputBuffer, not the previously saved ones.
                bytesRead -= m_numBufferedBytes;
            } else {
                LOG_ERROR("TECConvertText accepted some bytes it previously rejected with kTECPartialCharErr");
                bytesRead = 0;
            }
            m_numBufferedBytes = 0;
            if (status == kTECPartialCharErr) {
                // While there may be a partial character problem in the small buffer,
                // we have to try again and not get confused and think there is a partial
                // character problem in the large buffer.
                status = noErr;
            }
        }
    } else {
        // Nothing saved from an earlier call: convert straight from the caller's buffer.
        status = TECConvertText(m_converterTEC, inputBuffer, inputBufferLength, &bytesRead,
            static_cast<unsigned char*>(outputBuffer), outputBufferLength, &bytesWritten);
        ASSERT(static_cast<int>(bytesRead) <= inputBufferLength);
    }

    // Work around bug 3351093, where sometimes we get kTECBufferBelowMinimumSizeErr instead of kTECOutputBufferFullStatus.
    if (status == kTECBufferBelowMinimumSizeErr && bytesWritten != 0)
        status = kTECOutputBufferFullStatus;

    inputLength = bytesRead;
    outputLength = bytesWritten;
    return status;
}
    197 
    198 String TextCodecMac::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
    199 {
    200     // Get a converter for the passed-in encoding.
    201     if (!m_converterTEC && createTECConverter() != noErr)
    202         return String();
    203 
    204     Vector<UChar> result;
    205 
    206     const unsigned char* sourcePointer = reinterpret_cast<const unsigned char*>(bytes);
    207     int sourceLength = length;
    208     bool bufferWasFull = false;
    209     UniChar buffer[ConversionBufferSize];
    210 
    211     while ((sourceLength || bufferWasFull) && !sawError) {
    212         int bytesRead = 0;
    213         int bytesWritten = 0;
    214         OSStatus status = decode(sourcePointer, sourceLength, bytesRead, buffer, sizeof(buffer), bytesWritten);
    215         ASSERT(bytesRead <= sourceLength);
    216         sourcePointer += bytesRead;
    217         sourceLength -= bytesRead;
    218 
    219         switch (status) {
    220             case noErr:
    221             case kTECOutputBufferFullStatus:
    222                 break;
    223             case kTextMalformedInputErr:
    224             case kTextUndefinedElementErr:
    225                 // FIXME: Put FFFD character into the output string in this case?
    226                 TECClearConverterContextInfo(m_converterTEC);
    227                 if (stopOnError) {
    228                     sawError = true;
    229                     break;
    230                 }
    231                 if (sourceLength) {
    232                     sourcePointer += 1;
    233                     sourceLength -= 1;
    234                 }
    235                 break;
    236             case kTECPartialCharErr: {
    237                 // Put the partial character into the buffer.
    238                 ASSERT(m_numBufferedBytes == 0);
    239                 const int bufferSize = sizeof(m_numBufferedBytes);
    240                 if (sourceLength < bufferSize) {
    241                     memcpy(m_bufferedBytes, sourcePointer, sourceLength);
    242                     m_numBufferedBytes = sourceLength;
    243                 } else {
    244                     LOG_ERROR("TECConvertText gave a kTECPartialCharErr, but left %u bytes in the buffer", sourceLength);
    245                 }
    246                 sourceLength = 0;
    247                 break;
    248             }
    249             default:
    250                 sawError = true;
    251                 return String();
    252         }
    253 
    254         ASSERT(!(bytesWritten % sizeof(UChar)));
    255         result.append(buffer, bytesWritten / sizeof(UChar));
    256 
    257         bufferWasFull = status == kTECOutputBufferFullStatus;
    258     }
    259 
    260     if (flush) {
    261         unsigned long bytesWritten = 0;
    262         TECFlushText(m_converterTEC, reinterpret_cast<unsigned char*>(buffer), sizeof(buffer), &bytesWritten);
    263         ASSERT(!(bytesWritten % sizeof(UChar)));
    264         result.append(buffer, bytesWritten / sizeof(UChar));
    265     }
    266 
    267     String resultString = String::adopt(result);
    268 
    269     // <rdar://problem/3225472>
    270     // Simplified Chinese pages use the code A3A0 to mean "full-width space".
    271     // But GB18030 decodes it to U+E5E5, which is correct in theory but not in practice.
    272     // To work around, just change all occurences of U+E5E5 to U+3000 (ideographic space).
    273     if (m_encoding == kCFStringEncodingGB_18030_2000)
    274         resultString.replace(0xE5E5, ideographicSpace);
    275 
    276     return resultString;
    277 }
    278 
    279 CString TextCodecMac::encode(const UChar* characters, size_t length, UnencodableHandling handling)
    280 {
    281     // FIXME: We should really use TEC here instead of CFString for consistency with the other direction.
    282 
    283     // FIXME: Since there's no "force ASCII range" mode in CFString, we change the backslash into a yen sign.
    284     // Encoding will change the yen sign back into a backslash.
    285     String copy(characters, length);
    286     copy.replace('\\', m_backslashAsCurrencySymbol);
    287     RetainPtr<CFStringRef> cfs(AdoptCF, copy.createCFString());
    288 
    289     CFIndex startPos = 0;
    290     CFIndex charactersLeft = CFStringGetLength(cfs.get());
    291     Vector<char> result;
    292     size_t size = 0;
    293     UInt8 lossByte = handling == QuestionMarksForUnencodables ? '?' : 0;
    294     while (charactersLeft > 0) {
    295         CFRange range = CFRangeMake(startPos, charactersLeft);
    296         CFIndex bufferLength;
    297         CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, NULL, 0x7FFFFFFF, &bufferLength);
    298 
    299         result.grow(size + bufferLength);
    300         unsigned char* buffer = reinterpret_cast<unsigned char*>(result.data() + size);
    301         CFIndex charactersConverted = CFStringGetBytes(cfs.get(), range, m_encoding, lossByte, false, buffer, bufferLength, &bufferLength);
    302         size += bufferLength;
    303 
    304         if (charactersConverted != charactersLeft) {
    305             unsigned badChar = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
    306             ++charactersConverted;
    307             if ((badChar & 0xFC00) == 0xD800 && charactersConverted != charactersLeft) { // is high surrogate
    308                 UniChar low = CFStringGetCharacterAtIndex(cfs.get(), startPos + charactersConverted);
    309                 if ((low & 0xFC00) == 0xDC00) { // is low surrogate
    310                     badChar <<= 10;
    311                     badChar += low;
    312                     badChar += 0x10000 - (0xD800 << 10) - 0xDC00;
    313                     ++charactersConverted;
    314                 }
    315             }
    316             UnencodableReplacementArray entity;
    317             int entityLength = getUnencodableReplacement(badChar, handling, entity);
    318             result.grow(size + entityLength);
    319             memcpy(result.data() + size, entity, entityLength);
    320             size += entityLength;
    321         }
    322 
    323         startPos += charactersConverted;
    324         charactersLeft -= charactersConverted;
    325     }
    326     return CString(result.data(), size);
    327 }
    328 
    329 } // namespace WebCore
    330