Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "wtf/text/TextCodecUTF8.h"
     28 
     29 #include "wtf/text/TextCodecASCIIFastPath.h"
     30 #include "wtf/text/CString.h"
     31 #include "wtf/text/StringBuffer.h"
     32 #include "wtf/unicode/CharacterNames.h"
     33 
     34 using namespace WTF;
     35 using namespace WTF::Unicode;
     36 using namespace std;
     37 
     38 namespace WTF {
     39 
// Sentinel returned by decodeNonASCIISequence() when a byte sequence is not
// well-formed UTF-8. It is negative, so it can never collide with a decoded
// code point.
const int nonCharacter = -1;
     41 
     42 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
     43 {
     44     return adoptPtr(new TextCodecUTF8);
     45 }
     46 
     47 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
     48 {
     49     registrar("UTF-8", "UTF-8");
     50 
     51     // Additional aliases that originally were present in the encoding
     52     // table in WebKit on Macintosh, and subsequently added by
     53     // TextCodecICU. Perhaps we can prove some are not used on the web
     54     // and remove them.
     55     registrar("unicode11utf8", "UTF-8");
     56     registrar("unicode20utf8", "UTF-8");
     57     registrar("utf8", "UTF-8");
     58     registrar("x-unicode20utf8", "UTF-8");
     59 
     60     // Additional aliases present in the WHATWG Encoding Standard (http://encoding.spec.whatwg.org/)
     61     // and Firefox (24), but not in ICU 4.6.
     62     registrar("unicode-1-1-utf-8", "UTF-8");
     63 }
     64 
     65 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
     66 {
     67     registrar("UTF-8", create, 0);
     68 }
     69 
     70 static inline int nonASCIISequenceLength(uint8_t firstByte)
     71 {
     72     static const uint8_t lengths[256] = {
     73         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     74         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     76         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     77         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     78         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     79         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     80         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     82         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     83         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     84         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     85         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     86         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     87         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     88         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     89     };
     90     return lengths[firstByte];
     91 }
     92 
     93 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
     94 {
     95     ASSERT(!isASCII(sequence[0]));
     96     if (length == 2) {
     97         ASSERT(sequence[0] <= 0xDF);
     98         if (sequence[0] < 0xC2)
     99             return nonCharacter;
    100         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    101             return nonCharacter;
    102         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
    103     }
    104     if (length == 3) {
    105         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
    106         switch (sequence[0]) {
    107         case 0xE0:
    108             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
    109                 return nonCharacter;
    110             break;
    111         case 0xED:
    112             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
    113                 return nonCharacter;
    114             break;
    115         default:
    116             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    117                 return nonCharacter;
    118         }
    119         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    120             return nonCharacter;
    121         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    122     }
    123     ASSERT(length == 4);
    124     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
    125     switch (sequence[0]) {
    126     case 0xF0:
    127         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
    128             return nonCharacter;
    129         break;
    130     case 0xF4:
    131         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
    132             return nonCharacter;
    133         break;
    134     default:
    135         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    136             return nonCharacter;
    137     }
    138     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    139         return nonCharacter;
    140     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
    141         return nonCharacter;
    142     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
    143 }
    144 
    145 static inline UChar* appendCharacter(UChar* destination, int character)
    146 {
    147     ASSERT(character != nonCharacter);
    148     ASSERT(!U_IS_SURROGATE(character));
    149     if (U_IS_BMP(character))
    150         *destination++ = character;
    151     else {
    152         *destination++ = U16_LEAD(character);
    153         *destination++ = U16_TRAIL(character);
    154     }
    155     return destination;
    156 }
    157 
    158 void TextCodecUTF8::consumePartialSequenceByte()
    159 {
    160     --m_partialSequenceSize;
    161     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
    162 }
    163 
    164 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
    165 {
    166     sawError = true;
    167     if (stopOnError)
    168         return;
    169     // Each error generates a replacement character and consumes one byte.
    170     *destination++ = replacementCharacter;
    171     consumePartialSequenceByte();
    172 }
    173 
    174 template <>
    175 bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
    176 {
    177     ASSERT(m_partialSequenceSize);
    178     do {
    179         if (isASCII(m_partialSequence[0])) {
    180             *destination++ = m_partialSequence[0];
    181             consumePartialSequenceByte();
    182             continue;
    183         }
    184         int count = nonASCIISequenceLength(m_partialSequence[0]);
    185         if (!count)
    186             return true;
    187 
    188         if (count > m_partialSequenceSize) {
    189             if (count - m_partialSequenceSize > end - source) {
    190                 if (!flush) {
    191                     // The new data is not enough to complete the sequence, so
    192                     // add it to the existing partial sequence.
    193                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    194                     m_partialSequenceSize += end - source;
    195                     return false;
    196                 }
    197                 // An incomplete partial sequence at the end is an error, but it will create
    198                 // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
    199                 // the error.
    200                 return true;
    201             }
    202             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    203             source += count - m_partialSequenceSize;
    204             m_partialSequenceSize = count;
    205         }
    206         int character = decodeNonASCIISequence(m_partialSequence, count);
    207         if ((character == nonCharacter) || (character > 0xff))
    208             return true;
    209 
    210         m_partialSequenceSize -= count;
    211         *destination++ = character;
    212     } while (m_partialSequenceSize);
    213 
    214     return false;
    215 }
    216 
    217 template <>
    218 bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
    219 {
    220     ASSERT(m_partialSequenceSize);
    221     do {
    222         if (isASCII(m_partialSequence[0])) {
    223             *destination++ = m_partialSequence[0];
    224             consumePartialSequenceByte();
    225             continue;
    226         }
    227         int count = nonASCIISequenceLength(m_partialSequence[0]);
    228         if (!count) {
    229             handleError(destination, stopOnError, sawError);
    230             if (stopOnError)
    231                 return false;
    232             continue;
    233         }
    234         if (count > m_partialSequenceSize) {
    235             if (count - m_partialSequenceSize > end - source) {
    236                 if (!flush) {
    237                     // The new data is not enough to complete the sequence, so
    238                     // add it to the existing partial sequence.
    239                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    240                     m_partialSequenceSize += end - source;
    241                     return false;
    242                 }
    243                 // An incomplete partial sequence at the end is an error.
    244                 handleError(destination, stopOnError, sawError);
    245                 if (stopOnError)
    246                     return false;
    247                 continue;
    248             }
    249             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    250             source += count - m_partialSequenceSize;
    251             m_partialSequenceSize = count;
    252         }
    253         int character = decodeNonASCIISequence(m_partialSequence, count);
    254         if (character == nonCharacter) {
    255             handleError(destination, stopOnError, sawError);
    256             if (stopOnError)
    257                 return false;
    258             continue;
    259         }
    260 
    261         m_partialSequenceSize -= count;
    262         destination = appendCharacter(destination, character);
    263     } while (m_partialSequenceSize);
    264 
    265     return false;
    266 }
    267 
// Decodes |length| bytes of UTF-8 starting at |bytes| and returns the result
// as a String.
//
// Decoding starts in an 8-bit (LChar) buffer and stays there while every
// decoded character is <= 0xFF and no replacement character has to be
// emitted. Otherwise control jumps to upConvertTo16Bit, which copies what has
// been produced so far into a 16-bit (UChar) buffer and continues from the
// current source position.
//
// A multi-byte sequence cut off by the end of the input is saved in
// m_partialSequence and completed on the next call. When |flush| is true the
// outer loops run once more so that such a leftover is processed immediately
// (a truncated sequence is then an error).
//
// |sawError| is set to true when malformed input is met; it is never reset to
// false here. With |stopOnError| decoding stops at the first error; without
// it each error produces one replacement character and consumes one byte.
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    // Limit for the word-at-a-time ASCII fast path below.
    // NOTE(review): presumably |end| rounded down to a machine-word boundary - confirm in TextCodecASCIIFastPath.h.
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // ---- 8-bit pass ----
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            // A true result means the buffered sequence cannot be represented in 8 bits.
            // NOTE(review): only |source| is carried over in that case; anything the helper
            // wrote through its destination copy before returning true is not reflected in
            // |destination| - confirm the helper never writes before requesting up-conversion.
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes still buffered here means the helper is waiting for more input.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    // Copy whole machine words for as long as they contain only ASCII.
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop stopped on a non-ASCII byte: restart the outer loop
                    // so it is handled as a multi-byte sequence.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence runs past the end of this chunk: save the bytes we
                    // have so the next call (or the flush iteration) can finish it.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // The replacement character does not fit in 8 bits. |source| is left
                // on the bad byte so the 16-bit pass reports the error itself.
                goto upConvertTo16Bit;
            }
            // Same for any valid character above Latin-1: |source| is not advanced,
            // so the 16-bit pass decodes this sequence again.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

    // ---- 16-bit pass ----
    // Same structure as the 8-bit pass, except that errors are turned into
    // replacement characters in place instead of triggering a conversion.
upConvertTo16Bit:
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            // The 16-bit helper's return value carries no information, so it is ignored.
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Still buffered: either more input is needed or decoding stopped on an error.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // Truncated by the end of the chunk: buffer it for later.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
    427 
    428 template<typename CharType>
    429 CString TextCodecUTF8::encodeCommon(const CharType* characters, size_t length)
    430 {
    431     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
    432     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
    433     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
    434     if (length > numeric_limits<size_t>::max() / 3)
    435         CRASH();
    436     Vector<uint8_t> bytes(length * 3);
    437 
    438     size_t i = 0;
    439     size_t bytesWritten = 0;
    440     while (i < length) {
    441         UChar32 character;
    442         U16_NEXT(characters, i, length, character);
    443         // U16_NEXT will simply emit a surrogate code point if an unmatched surrogate
    444         // is encountered; we must convert it to a U+FFFD (REPLACEMENT CHARACTER) here.
    445         if (0xD800 <= character && character <= 0xDFFF)
    446             character = replacementCharacter;
    447         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
    448     }
    449 
    450     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
    451 }
    452 
    453 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
    454 {
    455     return encodeCommon(characters, length);
    456 }
    457 
    458 CString TextCodecUTF8::encode(const LChar* characters, size_t length, UnencodableHandling)
    459 {
    460     return encodeCommon(characters, length);
    461 }
    462 
    463 } // namespace WTF
    464