Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "wtf/text/TextCodecUTF8.h"
     28 
     29 #include "wtf/text/TextCodecASCIIFastPath.h"
     30 #include "wtf/text/CString.h"
     31 #include "wtf/text/StringBuffer.h"
     32 #include "wtf/unicode/CharacterNames.h"
     33 
     34 using namespace WTF;
     35 using namespace WTF::Unicode;
     36 using namespace std;
     37 
     38 namespace WTF {
     39 
     40 const int nonCharacter = -1;
     41 
     42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
     43 {
     44     return adoptPtr(new TextCodecUTF8);
     45 }
     46 
     47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
     48 {
     49     registrar("UTF-8", "UTF-8");
     50 
     51     // Additional aliases that originally were present in the encoding
     52     // table in WebKit on Macintosh, and subsequently added by
     53     // TextCodecICU. Perhaps we can prove some are not used on the web
     54     // and remove them.
     55     registrar("unicode11utf8", "UTF-8");
     56     registrar("unicode20utf8", "UTF-8");
     57     registrar("utf8", "UTF-8");
     58     registrar("x-unicode20utf8", "UTF-8");
     59 }
     60 
     61 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
     62 {
     63     registrar("UTF-8", create, 0);
     64 }
     65 
     66 static inline int nonASCIISequenceLength(uint8_t firstByte)
     67 {
     68     static const uint8_t lengths[256] = {
     69         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     70         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     71         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     72         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     73         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     74         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     77         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     78         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     79         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     82         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     83         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     84         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     85     };
     86     return lengths[firstByte];
     87 }
     88 
     89 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
     90 {
     91     ASSERT(!isASCII(sequence[0]));
     92     if (length == 2) {
     93         ASSERT(sequence[0] <= 0xDF);
     94         if (sequence[0] < 0xC2)
     95             return nonCharacter;
     96         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
     97             return nonCharacter;
     98         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
     99     }
    100     if (length == 3) {
    101         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
    102         switch (sequence[0]) {
    103         case 0xE0:
    104             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
    105                 return nonCharacter;
    106             break;
    107         case 0xED:
    108             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
    109                 return nonCharacter;
    110             break;
    111         default:
    112             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    113                 return nonCharacter;
    114         }
    115         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    116             return nonCharacter;
    117         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    118     }
    119     ASSERT(length == 4);
    120     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
    121     switch (sequence[0]) {
    122     case 0xF0:
    123         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
    124             return nonCharacter;
    125         break;
    126     case 0xF4:
    127         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
    128             return nonCharacter;
    129         break;
    130     default:
    131         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    132             return nonCharacter;
    133     }
    134     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    135         return nonCharacter;
    136     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
    137         return nonCharacter;
    138     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
    139 }
    140 
    141 static inline UChar* appendCharacter(UChar* destination, int character)
    142 {
    143     ASSERT(character != nonCharacter);
    144     ASSERT(!U_IS_SURROGATE(character));
    145     if (U_IS_BMP(character))
    146         *destination++ = character;
    147     else {
    148         *destination++ = U16_LEAD(character);
    149         *destination++ = U16_TRAIL(character);
    150     }
    151     return destination;
    152 }
    153 
    154 void TextCodecUTF8::consumePartialSequenceByte()
    155 {
    156     --m_partialSequenceSize;
    157     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
    158 }
    159 
    160 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
    161 {
    162     sawError = true;
    163     if (stopOnError)
    164         return;
    165     // Each error generates a replacement character and consumes one byte.
    166     *destination++ = replacementCharacter;
    167     consumePartialSequenceByte();
    168 }
    169 
    170 template <>
    171 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
    172 {
    173     ASSERT(m_partialSequenceSize);
    174     do {
    175         if (isASCII(m_partialSequence[0])) {
    176             *destination++ = m_partialSequence[0];
    177             consumePartialSequenceByte();
    178             continue;
    179         }
    180         int count = nonASCIISequenceLength(m_partialSequence[0]);
    181         if (!count)
    182             return true;
    183 
    184         if (count > m_partialSequenceSize) {
    185             if (count - m_partialSequenceSize > end - source) {
    186                 if (!flush) {
    187                     // The new data is not enough to complete the sequence, so
    188                     // add it to the existing partial sequence.
    189                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    190                     m_partialSequenceSize += end - source;
    191                     return false;
    192                 }
    193                 // An incomplete partial sequence at the end is an error, but it will create
    194                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
    195                 // the error.
    196                 return true;
    197             }
    198             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    199             source += count - m_partialSequenceSize;
    200             m_partialSequenceSize = count;
    201         }
    202         int character = decodeNonASCIISequence(m_partialSequence, count);
    203         if ((character == nonCharacter) || (character > 0xff))
    204             return true;
    205 
    206         m_partialSequenceSize -= count;
    207         *destination++ = character;
    208     } while (m_partialSequenceSize);
    209 
    210     return false;
    211 }
    212 
    213 template <>
    214 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
    215 {
    216     ASSERT(m_partialSequenceSize);
    217     do {
    218         if (isASCII(m_partialSequence[0])) {
    219             *destination++ = m_partialSequence[0];
    220             consumePartialSequenceByte();
    221             continue;
    222         }
    223         int count = nonASCIISequenceLength(m_partialSequence[0]);
    224         if (!count) {
    225             handleError(destination, stopOnError, sawError);
    226             if (stopOnError)
    227                 return false;
    228             continue;
    229         }
    230         if (count > m_partialSequenceSize) {
    231             if (count - m_partialSequenceSize > end - source) {
    232                 if (!flush) {
    233                     // The new data is not enough to complete the sequence, so
    234                     // add it to the existing partial sequence.
    235                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    236                     m_partialSequenceSize += end - source;
    237                     return false;
    238                 }
    239                 // An incomplete partial sequence at the end is an error.
    240                 handleError(destination, stopOnError, sawError);
    241                 if (stopOnError)
    242                     return false;
    243                 continue;
    244             }
    245             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    246             source += count - m_partialSequenceSize;
    247             m_partialSequenceSize = count;
    248         }
    249         int character = decodeNonASCIISequence(m_partialSequence, count);
    250         if (character == nonCharacter) {
    251             handleError(destination, stopOnError, sawError);
    252             if (stopOnError)
    253                 return false;
    254             continue;
    255         }
    256 
    257         m_partialSequenceSize -= count;
    258         destination = appendCharacter(destination, character);
    259     } while (m_partialSequenceSize);
    260 
    261     return false;
    262 }
    263 
    264 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
    265 {
    266     // Each input byte might turn into a character.
    267     // That includes all bytes in the partial-sequence buffer because
    268     // each byte in an invalid sequence will turn into a replacement character.
    269     StringBuffer<LChar> buffer(m_partialSequenceSize + length);
    270 
    271     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    272     const uint8_t* end = source + length;
    273     const uint8_t* alignedEnd = alignToMachineWord(end);
    274     LChar* destination = buffer.characters();
    275 
    276     do {
    277         if (m_partialSequenceSize) {
    278             // Explicitly copy destination and source pointers to avoid taking pointers to the
    279             // local variables, which may harm code generation by disabling some optimizations
    280             // in some compilers.
    281             LChar* destinationForHandlePartialSequence = destination;
    282             const uint8_t* sourceForHandlePartialSequence = source;
    283             if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
    284                 source = sourceForHandlePartialSequence;
    285                 goto upConvertTo16Bit;
    286             }
    287             destination = destinationForHandlePartialSequence;
    288             source = sourceForHandlePartialSequence;
    289             if (m_partialSequenceSize)
    290                 break;
    291         }
    292 
    293         while (source < end) {
    294             if (isASCII(*source)) {
    295                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
    296                 if (isAlignedToMachineWord(source)) {
    297                     while (source < alignedEnd) {
    298                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
    299                         if (!isAllASCII<LChar>(chunk))
    300                             break;
    301                         copyASCIIMachineWord(destination, source);
    302                         source += sizeof(MachineWord);
    303                         destination += sizeof(MachineWord);
    304                     }
    305                     if (source == end)
    306                         break;
    307                     if (!isASCII(*source))
    308                         continue;
    309                 }
    310                 *destination++ = *source++;
    311                 continue;
    312             }
    313             int count = nonASCIISequenceLength(*source);
    314             int character;
    315             if (!count)
    316                 character = nonCharacter;
    317             else {
    318                 if (count > end - source) {
    319                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
    320                     ASSERT(!m_partialSequenceSize);
    321                     m_partialSequenceSize = end - source;
    322                     memcpy(m_partialSequence, source, m_partialSequenceSize);
    323                     source = end;
    324                     break;
    325                 }
    326                 character = decodeNonASCIISequence(source, count);
    327             }
    328             if (character == nonCharacter) {
    329                 sawError = true;
    330                 if (stopOnError)
    331                     break;
    332 
    333                 goto upConvertTo16Bit;
    334             }
    335             if (character > 0xff)
    336                 goto upConvertTo16Bit;
    337 
    338             source += count;
    339             *destination++ = character;
    340         }
    341     } while (flush && m_partialSequenceSize);
    342 
    343     buffer.shrink(destination - buffer.characters());
    344 
    345     return String::adopt(buffer);
    346 
    347 upConvertTo16Bit:
    348     StringBuffer<UChar> buffer16(m_partialSequenceSize + length);
    349 
    350     UChar* destination16 = buffer16.characters();
    351 
    352     // Copy the already converted characters
    353     for (LChar* converted8 = buffer.characters(); converted8 < destination;)
    354         *destination16++ = *converted8++;
    355 
    356     do {
    357         if (m_partialSequenceSize) {
    358             // Explicitly copy destination and source pointers to avoid taking pointers to the
    359             // local variables, which may harm code generation by disabling some optimizations
    360             // in some compilers.
    361             UChar* destinationForHandlePartialSequence = destination16;
    362             const uint8_t* sourceForHandlePartialSequence = source;
    363             handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
    364             destination16 = destinationForHandlePartialSequence;
    365             source = sourceForHandlePartialSequence;
    366             if (m_partialSequenceSize)
    367                 break;
    368         }
    369 
    370         while (source < end) {
    371             if (isASCII(*source)) {
    372                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
    373                 if (isAlignedToMachineWord(source)) {
    374                     while (source < alignedEnd) {
    375                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
    376                         if (!isAllASCII<LChar>(chunk))
    377                             break;
    378                         copyASCIIMachineWord(destination16, source);
    379                         source += sizeof(MachineWord);
    380                         destination16 += sizeof(MachineWord);
    381                     }
    382                     if (source == end)
    383                         break;
    384                     if (!isASCII(*source))
    385                         continue;
    386                 }
    387                 *destination16++ = *source++;
    388                 continue;
    389             }
    390             int count = nonASCIISequenceLength(*source);
    391             int character;
    392             if (!count)
    393                 character = nonCharacter;
    394             else {
    395                 if (count > end - source) {
    396                     ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
    397                     ASSERT(!m_partialSequenceSize);
    398                     m_partialSequenceSize = end - source;
    399                     memcpy(m_partialSequence, source, m_partialSequenceSize);
    400                     source = end;
    401                     break;
    402                 }
    403                 character = decodeNonASCIISequence(source, count);
    404             }
    405             if (character == nonCharacter) {
    406                 sawError = true;
    407                 if (stopOnError)
    408                     break;
    409                 // Each error generates a replacement character and consumes one byte.
    410                 *destination16++ = replacementCharacter;
    411                 ++source;
    412                 continue;
    413             }
    414             source += count;
    415             destination16 = appendCharacter(destination16, character);
    416         }
    417     } while (flush && m_partialSequenceSize);
    418 
    419     buffer16.shrink(destination16 - buffer16.characters());
    420 
    421     return String::adopt(buffer16);
    422 }
    423 
    424 template<typename CharType>
    425 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
    426 {
    427     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
    428     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
    429     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
    430     if (length > numeric_limits<size_t>::max() / 3)
    431         CRASH();
    432     Vector<uint8_t> bytes(length * 3);
    433 
    434     size_t i = 0;
    435     size_t bytesWritten = 0;
    436     while (i < length) {
    437         UChar32 character;
    438         U16_NEXT(characters, i, length, character);
    439         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
    440     }
    441 
    442     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
    443 }
    444 
    445 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
    446 {
    447     return encodeCommon(characters, length);
    448 }
    449 
    450 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
    451 {
    452     return encodeCommon(characters, length);
    453 }
    454 
    455 } // namespace WTF
    456