Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2004, 2006, 2008, 2011 Apple Inc. All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "TextCodecUTF8.h"
     28 
     29 #include "TextCodecASCIIFastPath.h"
     30 #include <wtf/text/CString.h>
     31 #include <wtf/text/StringBuffer.h>
     32 #include <wtf/unicode/CharacterNames.h>
     33 
     34 using namespace WTF::Unicode;
     35 using namespace std;
     36 
     37 namespace WebCore {
     38 
     39 const int nonCharacter = -1;
     40 
     41 PassOwnPtr<TextCodec> TextCodecUTF8::create(const TextEncoding&, const void*)
     42 {
     43     return adoptPtr(new TextCodecUTF8);
     44 }
     45 
     46 void TextCodecUTF8::registerEncodingNames(EncodingNameRegistrar registrar)
     47 {
     48     registrar("UTF-8", "UTF-8");
     49 
     50     // Additional aliases that originally were present in the encoding
     51     // table in WebKit on Macintosh, and subsequently added by
     52     // TextCodecICU. Perhaps we can prove some are not used on the web
     53     // and remove them.
     54     registrar("unicode11utf8", "UTF-8");
     55     registrar("unicode20utf8", "UTF-8");
     56     registrar("utf8", "UTF-8");
     57     registrar("x-unicode20utf8", "UTF-8");
     58 }
     59 
     60 void TextCodecUTF8::registerCodecs(TextCodecRegistrar registrar)
     61 {
     62     registrar("UTF-8", create, 0);
     63 }
     64 
     65 static inline int nonASCIISequenceLength(uint8_t firstByte)
     66 {
     67     static const uint8_t lengths[256] = {
     68         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     69         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     70         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     71         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     72         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     73         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     74         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     75         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     76         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     77         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     78         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     79         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     80         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     81         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     82         3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     83         4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
     84     };
     85     return lengths[firstByte];
     86 }
     87 
     88 static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
     89 {
     90     ASSERT(!isASCII(sequence[0]));
     91     if (length == 2) {
     92         ASSERT(sequence[0] <= 0xDF);
     93         if (sequence[0] < 0xC2)
     94             return nonCharacter;
     95         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
     96             return nonCharacter;
     97         return ((sequence[0] << 6) + sequence[1]) - 0x00003080;
     98     }
     99     if (length == 3) {
    100         ASSERT(sequence[0] >= 0xE0 && sequence[0] <= 0xEF);
    101         switch (sequence[0]) {
    102         case 0xE0:
    103             if (sequence[1] < 0xA0 || sequence[1] > 0xBF)
    104                 return nonCharacter;
    105             break;
    106         case 0xED:
    107             if (sequence[1] < 0x80 || sequence[1] > 0x9F)
    108                 return nonCharacter;
    109             break;
    110         default:
    111             if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    112                 return nonCharacter;
    113         }
    114         if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    115             return nonCharacter;
    116         return ((sequence[0] << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    117     }
    118     ASSERT(length == 4);
    119     ASSERT(sequence[0] >= 0xF0 && sequence[0] <= 0xF4);
    120     switch (sequence[0]) {
    121     case 0xF0:
    122         if (sequence[1] < 0x90 || sequence[1] > 0xBF)
    123             return nonCharacter;
    124         break;
    125     case 0xF4:
    126         if (sequence[1] < 0x80 || sequence[1] > 0x8F)
    127             return nonCharacter;
    128         break;
    129     default:
    130         if (sequence[1] < 0x80 || sequence[1] > 0xBF)
    131             return nonCharacter;
    132     }
    133     if (sequence[2] < 0x80 || sequence[2] > 0xBF)
    134         return nonCharacter;
    135     if (sequence[3] < 0x80 || sequence[3] > 0xBF)
    136         return nonCharacter;
    137     return ((sequence[0] << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
    138 }
    139 
    140 static inline UChar* appendCharacter(UChar* destination, int character)
    141 {
    142     ASSERT(character != nonCharacter);
    143     ASSERT(!U_IS_SURROGATE(character));
    144     if (U_IS_BMP(character))
    145         *destination++ = character;
    146     else {
    147         *destination++ = U16_LEAD(character);
    148         *destination++ = U16_TRAIL(character);
    149     }
    150     return destination;
    151 }
    152 
    153 void TextCodecUTF8::consumePartialSequenceByte()
    154 {
    155     --m_partialSequenceSize;
    156     memmove(m_partialSequence, m_partialSequence + 1, m_partialSequenceSize);
    157 }
    158 
    159 void TextCodecUTF8::handleError(UChar*& destination, bool stopOnError, bool& sawError)
    160 {
    161     sawError = true;
    162     if (stopOnError)
    163         return;
    164     // Each error generates a replacement character and consumes one byte.
    165     *destination++ = replacementCharacter;
    166     consumePartialSequenceByte();
    167 }
    168 
    169 void TextCodecUTF8::handlePartialSequence(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
    170 {
    171     ASSERT(m_partialSequenceSize);
    172     do {
    173         if (isASCII(m_partialSequence[0])) {
    174             *destination++ = m_partialSequence[0];
    175             consumePartialSequenceByte();
    176             continue;
    177         }
    178         int count = nonASCIISequenceLength(m_partialSequence[0]);
    179         if (!count) {
    180             handleError(destination, stopOnError, sawError);
    181             if (stopOnError)
    182                 return;
    183             continue;
    184         }
    185         if (count > m_partialSequenceSize) {
    186             if (count - m_partialSequenceSize > end - source) {
    187                 if (!flush) {
    188                     // The new data is not enough to complete the sequence, so
    189                     // add it to the existing partial sequence.
    190                     memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
    191                     m_partialSequenceSize += end - source;
    192                     return;
    193                 }
    194                 // An incomplete partial sequence at the end is an error.
    195                 handleError(destination, stopOnError, sawError);
    196                 if (stopOnError)
    197                     return;
    198                 continue;
    199             }
    200             memcpy(m_partialSequence + m_partialSequenceSize, source, count - m_partialSequenceSize);
    201             source += count - m_partialSequenceSize;
    202             m_partialSequenceSize = count;
    203         }
    204         int character = decodeNonASCIISequence(m_partialSequence, count);
    205         if (character == nonCharacter) {
    206             handleError(destination, stopOnError, sawError);
    207             if (stopOnError)
    208                 return;
    209             continue;
    210         }
    211         m_partialSequenceSize -= count;
    212         destination = appendCharacter(destination, character);
    213     } while (m_partialSequenceSize);
    214 }
    215 
    216 String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
    217 {
    218     // Each input byte might turn into a character.
    219     // That includes all bytes in the partial-sequence buffer because
    220     // each byte in an invalid sequence will turn into a replacement character.
    221     StringBuffer buffer(m_partialSequenceSize + length);
    222 
    223     const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    224     const uint8_t* end = source + length;
    225     const uint8_t* alignedEnd = alignToMachineWord(end);
    226     UChar* destination = buffer.characters();
    227 
    228     do {
    229         if (m_partialSequenceSize) {
    230             // Explicitly copy destination and source pointers to avoid taking pointers to the
    231             // local variables, which may harm code generation by disabling some optimizations
    232             // in some compilers.
    233             UChar* destinationForHandlePartialSequence = destination;
    234             const uint8_t* sourceForHandlePartialSequence = source;
    235             handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
    236             destination = destinationForHandlePartialSequence;
    237             source = sourceForHandlePartialSequence;
    238             if (m_partialSequenceSize)
    239                 break;
    240         }
    241 
    242         while (source < end) {
    243             if (isASCII(*source)) {
    244                 // Fast path for ASCII. Most UTF-8 text will be ASCII.
    245                 if (isAlignedToMachineWord(source)) {
    246                     while (source < alignedEnd) {
    247                         MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
    248                         if (!isAllASCII(chunk))
    249                             break;
    250                         copyASCIIMachineWord(destination, source);
    251                         source += sizeof(MachineWord);
    252                         destination += sizeof(MachineWord);
    253                     }
    254                     if (source == end)
    255                         break;
    256                     if (!isASCII(*source))
    257                         continue;
    258                 }
    259                 *destination++ = *source++;
    260                 continue;
    261             }
    262             int count = nonASCIISequenceLength(*source);
    263             int character;
    264             if (!count)
    265                 character = nonCharacter;
    266             else {
    267                 if (count > end - source) {
    268                     ASSERT(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
    269                     ASSERT(!m_partialSequenceSize);
    270                     m_partialSequenceSize = end - source;
    271                     memcpy(m_partialSequence, source, m_partialSequenceSize);
    272                     source = end;
    273                     break;
    274                 }
    275                 character = decodeNonASCIISequence(source, count);
    276             }
    277             if (character == nonCharacter) {
    278                 sawError = true;
    279                 if (stopOnError)
    280                     break;
    281                 // Each error generates a replacement character and consumes one byte.
    282                 *destination++ = replacementCharacter;
    283                 ++source;
    284                 continue;
    285             }
    286             source += count;
    287             destination = appendCharacter(destination, character);
    288         }
    289     } while (flush && m_partialSequenceSize);
    290 
    291     buffer.shrink(destination - buffer.characters());
    292 
    293     return String::adopt(buffer);
    294 }
    295 
    296 CString TextCodecUTF8::encode(const UChar* characters, size_t length, UnencodableHandling)
    297 {
    298     // The maximum number of UTF-8 bytes needed per UTF-16 code unit is 3.
    299     // BMP characters take only one UTF-16 code unit and can take up to 3 bytes (3x).
    300     // Non-BMP characters take two UTF-16 code units and can take up to 4 bytes (2x).
    301     if (length > numeric_limits<size_t>::max() / 3)
    302         CRASH();
    303     Vector<uint8_t> bytes(length * 3);
    304 
    305     size_t i = 0;
    306     size_t bytesWritten = 0;
    307     while (i < length) {
    308         UChar32 character;
    309         U16_NEXT(characters, i, length, character);
    310         U8_APPEND_UNSAFE(bytes.data(), bytesWritten, character);
    311     }
    312 
    313     return CString(reinterpret_cast<char*>(bytes.data()), bytesWritten);
    314 }
    315 
    316 } // namespace WebCore
    317