Home | History | Annotate | Download | only in unicode
      1 /*
      2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
      3  * Copyright (C) 2010 Patrick Gansterer <paroga (at) paroga.com>
      4  *
      5  * Redistribution and use in source and binary forms, with or without
      6  * modification, are permitted provided that the following conditions
      7  * are met:
      8  * 1. Redistributions of source code must retain the above copyright
      9  *    notice, this list of conditions and the following disclaimer.
     10  * 2. Redistributions in binary form must reproduce the above copyright
     11  *    notice, this list of conditions and the following disclaimer in the
     12  *    documentation and/or other materials provided with the distribution.
     13  *
     14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25  */
     26 
     27 #include "config.h"
     28 #include "wtf/unicode/UTF8.h"
     29 
     30 #include "wtf/ASCIICType.h"
     31 #include "wtf/StringHasher.h"
     32 #include "wtf/unicode/CharacterNames.h"
     33 
     34 namespace WTF {
     35 namespace Unicode {
     36 
     37 inline int inlineUTF8SequenceLengthNonASCII(char b0)
     38 {
     39     if ((b0 & 0xC0) != 0xC0)
     40         return 0;
     41     if ((b0 & 0xE0) == 0xC0)
     42         return 2;
     43     if ((b0 & 0xF0) == 0xE0)
     44         return 3;
     45     if ((b0 & 0xF8) == 0xF0)
     46         return 4;
     47     return 0;
     48 }
     49 
     50 inline int inlineUTF8SequenceLength(char b0)
     51 {
     52     return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
     53 }
     54 
     55 int UTF8SequenceLength(char b0)
     56 {
     57     return isASCII(b0) ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
     58 }
     59 
     60 int decodeUTF8Sequence(const char* sequence)
     61 {
     62     // Handle 0-byte sequences (never valid).
     63     const unsigned char b0 = sequence[0];
     64     const int length = inlineUTF8SequenceLength(b0);
     65     if (length == 0)
     66         return -1;
     67 
     68     // Handle 1-byte sequences (plain ASCII).
     69     const unsigned char b1 = sequence[1];
     70     if (length == 1) {
     71         if (b1)
     72             return -1;
     73         return b0;
     74     }
     75 
     76     // Handle 2-byte sequences.
     77     if ((b1 & 0xC0) != 0x80)
     78         return -1;
     79     const unsigned char b2 = sequence[2];
     80     if (length == 2) {
     81         if (b2)
     82             return -1;
     83         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
     84         if (c < 0x80)
     85             return -1;
     86         return c;
     87     }
     88 
     89     // Handle 3-byte sequences.
     90     if ((b2 & 0xC0) != 0x80)
     91         return -1;
     92     const unsigned char b3 = sequence[3];
     93     if (length == 3) {
     94         if (b3)
     95             return -1;
     96         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
     97         if (c < 0x800)
     98             return -1;
     99         // UTF-16 surrogates should never appear in UTF-8 data.
    100         if (c >= 0xD800 && c <= 0xDFFF)
    101             return -1;
    102         return c;
    103     }
    104 
    105     // Handle 4-byte sequences.
    106     if ((b3 & 0xC0) != 0x80)
    107         return -1;
    108     const unsigned char b4 = sequence[4];
    109     if (length == 4) {
    110         if (b4)
    111             return -1;
    112         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    113         if (c < 0x10000 || c > 0x10FFFF)
    114             return -1;
    115         return c;
    116     }
    117 
    118     return -1;
    119 }
    120 
    121 // Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
    122 // into the first byte, depending on how many bytes follow.  There are
    123 // as many entries in this table as there are UTF-8 sequence types.
    124 // (I.e., one byte sequence, two byte... etc.). Remember that sequencs
    125 // for *legal* UTF-8 will be 4 or fewer bytes total.
    126 static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    127 
    128 ConversionResult convertLatin1ToUTF8(
    129                                      const LChar** sourceStart, const LChar* sourceEnd,
    130                                      char** targetStart, char* targetEnd)
    131 {
    132     ConversionResult result = conversionOK;
    133     const LChar* source = *sourceStart;
    134     char* target = *targetStart;
    135     while (source < sourceEnd) {
    136         UChar32 ch;
    137         unsigned short bytesToWrite = 0;
    138         const UChar32 byteMask = 0xBF;
    139         const UChar32 byteMark = 0x80;
    140         const LChar* oldSource = source; // In case we have to back up because of target overflow.
    141         ch = static_cast<unsigned short>(*source++);
    142 
    143         // Figure out how many bytes the result will require
    144         if (ch < (UChar32)0x80)
    145             bytesToWrite = 1;
    146         else
    147             bytesToWrite = 2;
    148 
    149         target += bytesToWrite;
    150         if (target > targetEnd) {
    151             source = oldSource; // Back up source pointer!
    152             target -= bytesToWrite;
    153             result = targetExhausted;
    154             break;
    155         }
    156         switch (bytesToWrite) { // note: everything falls through.
    157         case 2:
    158             *--target = (char)((ch | byteMark) & byteMask);
    159             ch >>= 6;
    160         case 1:
    161             *--target =  (char)(ch | firstByteMark[bytesToWrite]);
    162         }
    163         target += bytesToWrite;
    164     }
    165     *sourceStart = source;
    166     *targetStart = target;
    167     return result;
    168 }
    169 
    170 ConversionResult convertUTF16ToUTF8(
    171     const UChar** sourceStart, const UChar* sourceEnd,
    172     char** targetStart, char* targetEnd, bool strict)
    173 {
    174     ConversionResult result = conversionOK;
    175     const UChar* source = *sourceStart;
    176     char* target = *targetStart;
    177     while (source < sourceEnd) {
    178         UChar32 ch;
    179         unsigned short bytesToWrite = 0;
    180         const UChar32 byteMask = 0xBF;
    181         const UChar32 byteMark = 0x80;
    182         const UChar* oldSource = source; // In case we have to back up because of target overflow.
    183         ch = static_cast<unsigned short>(*source++);
    184         // If we have a surrogate pair, convert to UChar32 first.
    185         if (ch >= 0xD800 && ch <= 0xDBFF) {
    186             // If the 16 bits following the high surrogate are in the source buffer...
    187             if (source < sourceEnd) {
    188                 UChar32 ch2 = static_cast<unsigned short>(*source);
    189                 // If it's a low surrogate, convert to UChar32.
    190                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
    191                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
    192                     ++source;
    193                 } else if (strict) { // it's an unpaired high surrogate
    194                     --source; // return to the illegal value itself
    195                     result = sourceIllegal;
    196                     break;
    197                 }
    198             } else { // We don't have the 16 bits following the high surrogate.
    199                 --source; // return to the high surrogate
    200                 result = sourceExhausted;
    201                 break;
    202             }
    203         } else if (strict) {
    204             // UTF-16 surrogate values are illegal in UTF-32
    205             if (ch >= 0xDC00 && ch <= 0xDFFF) {
    206                 --source; // return to the illegal value itself
    207                 result = sourceIllegal;
    208                 break;
    209             }
    210         }
    211         // Figure out how many bytes the result will require
    212         if (ch < (UChar32)0x80) {
    213             bytesToWrite = 1;
    214         } else if (ch < (UChar32)0x800) {
    215             bytesToWrite = 2;
    216         } else if (ch < (UChar32)0x10000) {
    217             bytesToWrite = 3;
    218         } else if (ch < (UChar32)0x110000) {
    219             bytesToWrite = 4;
    220         } else {
    221             bytesToWrite = 3;
    222             ch = replacementCharacter;
    223         }
    224 
    225         target += bytesToWrite;
    226         if (target > targetEnd) {
    227             source = oldSource; // Back up source pointer!
    228             target -= bytesToWrite;
    229             result = targetExhausted;
    230             break;
    231         }
    232         switch (bytesToWrite) { // note: everything falls through.
    233             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    234             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    235             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    236             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
    237         }
    238         target += bytesToWrite;
    239     }
    240     *sourceStart = source;
    241     *targetStart = target;
    242     return result;
    243 }
    244 
    245 // This must be called with the length pre-determined by the first byte.
    246 // If presented with a length > 4, this returns false.  The Unicode
    247 // definition of UTF-8 goes up to 4-byte sequences.
    248 static bool isLegalUTF8(const unsigned char* source, int length)
    249 {
    250     unsigned char a;
    251     const unsigned char* srcptr = source + length;
    252     switch (length) {
    253         default: return false;
    254         // Everything else falls through when "true"...
    255         case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    256         case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    257         case 2: if ((a = (*--srcptr)) > 0xBF) return false;
    258 
    259         switch (*source) {
    260             // no fall-through in this inner switch
    261             case 0xE0: if (a < 0xA0) return false; break;
    262             case 0xED: if (a > 0x9F) return false; break;
    263             case 0xF0: if (a < 0x90) return false; break;
    264             case 0xF4: if (a > 0x8F) return false; break;
    265             default:   if (a < 0x80) return false;
    266         }
    267 
    268         case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    269     }
    270     if (*source > 0xF4)
    271         return false;
    272     return true;
    273 }
    274 
    275 // Magic values subtracted from a buffer value during UTF8 conversion.
    276 // This table contains as many values as there might be trailing bytes
    277 // in a UTF-8 sequence.
    278 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 0x03C82080UL, static_cast<UChar32>(0xFA082080UL), static_cast<UChar32>(0x82082080UL) };
    279 
    280 static inline UChar32 readUTF8Sequence(const char*& sequence, unsigned length)
    281 {
    282     UChar32 character = 0;
    283 
    284     // The cases all fall through.
    285     switch (length) {
    286         case 6: character += static_cast<unsigned char>(*sequence++); character <<= 6;
    287         case 5: character += static_cast<unsigned char>(*sequence++); character <<= 6;
    288         case 4: character += static_cast<unsigned char>(*sequence++); character <<= 6;
    289         case 3: character += static_cast<unsigned char>(*sequence++); character <<= 6;
    290         case 2: character += static_cast<unsigned char>(*sequence++); character <<= 6;
    291         case 1: character += static_cast<unsigned char>(*sequence++);
    292     }
    293 
    294     return character - offsetsFromUTF8[length - 1];
    295 }
    296 
    297 ConversionResult convertUTF8ToUTF16(
    298     const char** sourceStart, const char* sourceEnd,
    299     UChar** targetStart, UChar* targetEnd, bool* sourceAllASCII, bool strict)
    300 {
    301     ConversionResult result = conversionOK;
    302     const char* source = *sourceStart;
    303     UChar* target = *targetStart;
    304     UChar orAllData = 0;
    305     while (source < sourceEnd) {
    306         int utf8SequenceLength = inlineUTF8SequenceLength(*source);
    307         if (sourceEnd - source < utf8SequenceLength)  {
    308             result = sourceExhausted;
    309             break;
    310         }
    311         // Do this check whether lenient or strict
    312         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), utf8SequenceLength)) {
    313             result = sourceIllegal;
    314             break;
    315         }
    316 
    317         UChar32 character = readUTF8Sequence(source, utf8SequenceLength);
    318 
    319         if (target >= targetEnd) {
    320             source -= utf8SequenceLength; // Back up source pointer!
    321             result = targetExhausted;
    322             break;
    323         }
    324 
    325         if (U_IS_BMP(character)) {
    326             // UTF-16 surrogate values are illegal in UTF-32
    327             if (U_IS_SURROGATE(character)) {
    328                 if (strict) {
    329                     source -= utf8SequenceLength; // return to the illegal value itself
    330                     result = sourceIllegal;
    331                     break;
    332                 } else {
    333                     *target++ = replacementCharacter;
    334                     orAllData |= replacementCharacter;
    335                 }
    336             } else {
    337                 *target++ = character; // normal case
    338                 orAllData |= character;
    339             }
    340         } else if (U_IS_SUPPLEMENTARY(character)) {
    341             // target is a character in range 0xFFFF - 0x10FFFF
    342             if (target + 1 >= targetEnd) {
    343                 source -= utf8SequenceLength; // Back up source pointer!
    344                 result = targetExhausted;
    345                 break;
    346             }
    347             *target++ = U16_LEAD(character);
    348             *target++ = U16_TRAIL(character);
    349             orAllData = 0xffff;
    350         } else {
    351             if (strict) {
    352                 source -= utf8SequenceLength; // return to the start
    353                 result = sourceIllegal;
    354                 break; // Bail out; shouldn't continue
    355             } else {
    356                 *target++ = replacementCharacter;
    357                 orAllData |= replacementCharacter;
    358             }
    359         }
    360     }
    361     *sourceStart = source;
    362     *targetStart = target;
    363 
    364     if (sourceAllASCII)
    365         *sourceAllASCII = !(orAllData & ~0x7f);
    366 
    367     return result;
    368 }
    369 
    370 unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length)
    371 {
    372     if (!data)
    373         return 0;
    374 
    375     StringHasher stringHasher;
    376     dataLength = 0;
    377     utf16Length = 0;
    378 
    379     while (data < dataEnd || (!dataEnd && *data)) {
    380         if (isASCII(*data)) {
    381             stringHasher.addCharacter(*data++);
    382             dataLength++;
    383             utf16Length++;
    384             continue;
    385         }
    386 
    387         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data);
    388         dataLength += utf8SequenceLength;
    389 
    390         if (!dataEnd) {
    391             for (int i = 1; i < utf8SequenceLength; ++i) {
    392                 if (!data[i])
    393                     return 0;
    394             }
    395         } else if (dataEnd - data < utf8SequenceLength)
    396             return 0;
    397 
    398         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength))
    399             return 0;
    400 
    401         UChar32 character = readUTF8Sequence(data, utf8SequenceLength);
    402         ASSERT(!isASCII(character));
    403 
    404         if (U_IS_BMP(character)) {
    405             // UTF-16 surrogate values are illegal in UTF-32
    406             if (U_IS_SURROGATE(character))
    407                 return 0;
    408             stringHasher.addCharacter(static_cast<UChar>(character)); // normal case
    409             utf16Length++;
    410         } else if (U_IS_SUPPLEMENTARY(character)) {
    411             stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)),
    412                                        static_cast<UChar>(U16_TRAIL(character)));
    413             utf16Length += 2;
    414         } else
    415             return 0;
    416     }
    417 
    418     return stringHasher.hashWithTop8BitsMasked();
    419 }
    420 
    421 template<typename CharType>
    422 ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
    423 {
    424     while (b < bEnd) {
    425         if (isASCII(*b)) {
    426             if (*a++ != *b++)
    427                 return false;
    428             continue;
    429         }
    430 
    431         int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);
    432 
    433         if (bEnd - b < utf8SequenceLength)
    434             return false;
    435 
    436         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
    437             return 0;
    438 
    439         UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
    440         ASSERT(!isASCII(character));
    441 
    442         if (U_IS_BMP(character)) {
    443             // UTF-16 surrogate values are illegal in UTF-32
    444             if (U_IS_SURROGATE(character))
    445                 return false;
    446             if (*a++ != character)
    447                 return false;
    448         } else if (U_IS_SUPPLEMENTARY(character)) {
    449             if (*a++ != U16_LEAD(character))
    450                 return false;
    451             if (*a++ != U16_TRAIL(character))
    452                 return false;
    453         } else
    454             return false;
    455     }
    456 
    457     return a == aEnd;
    458 }
    459 
    460 bool equalUTF16WithUTF8(const UChar* a, const UChar* aEnd, const char* b, const char* bEnd)
    461 {
    462     return equalWithUTF8Internal(a, aEnd, b, bEnd);
    463 }
    464 
    465 bool equalLatin1WithUTF8(const LChar* a, const LChar* aEnd, const char* b, const char* bEnd)
    466 {
    467     return equalWithUTF8Internal(a, aEnd, b, bEnd);
    468 }
    469 
    470 } // namespace Unicode
    471 } // namespace WTF
    472