Home | History | Annotate | Download | only in unicode
      1 /*
      2  * Copyright (C) 2007 Apple Inc.  All rights reserved.
      3  *
      4  * Redistribution and use in source and binary forms, with or without
      5  * modification, are permitted provided that the following conditions
      6  * are met:
      7  * 1. Redistributions of source code must retain the above copyright
      8  *    notice, this list of conditions and the following disclaimer.
      9  * 2. Redistributions in binary form must reproduce the above copyright
     10  *    notice, this list of conditions and the following disclaimer in the
     11  *    documentation and/or other materials provided with the distribution.
     12  *
     13  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
     14  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     16  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
     17  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     18  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     19  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     20  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     21  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     23  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     24  */
     25 
     26 #include "config.h"
     27 #include "UTF8.h"
     28 
     29 namespace WTF {
     30 namespace Unicode {
     31 
     32 inline int inlineUTF8SequenceLengthNonASCII(char b0)
     33 {
     34     if ((b0 & 0xC0) != 0xC0)
     35         return 0;
     36     if ((b0 & 0xE0) == 0xC0)
     37         return 2;
     38     if ((b0 & 0xF0) == 0xE0)
     39         return 3;
     40     if ((b0 & 0xF8) == 0xF0)
     41         return 4;
     42     return 0;
     43 }
     44 
     45 inline int inlineUTF8SequenceLength(char b0)
     46 {
     47     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
     48 }
     49 
     50 int UTF8SequenceLength(char b0)
     51 {
     52     return (b0 & 0x80) == 0 ? 1 : inlineUTF8SequenceLengthNonASCII(b0);
     53 }
     54 
     55 int decodeUTF8Sequence(const char* sequence)
     56 {
     57     // Handle 0-byte sequences (never valid).
     58     const unsigned char b0 = sequence[0];
     59     const int length = inlineUTF8SequenceLength(b0);
     60     if (length == 0)
     61         return -1;
     62 
     63     // Handle 1-byte sequences (plain ASCII).
     64     const unsigned char b1 = sequence[1];
     65     if (length == 1) {
     66         if (b1)
     67             return -1;
     68         return b0;
     69     }
     70 
     71     // Handle 2-byte sequences.
     72     if ((b1 & 0xC0) != 0x80)
     73         return -1;
     74     const unsigned char b2 = sequence[2];
     75     if (length == 2) {
     76         if (b2)
     77             return -1;
     78         const int c = ((b0 & 0x1F) << 6) | (b1 & 0x3F);
     79         if (c < 0x80)
     80             return -1;
     81         return c;
     82     }
     83 
     84     // Handle 3-byte sequences.
     85     if ((b2 & 0xC0) != 0x80)
     86         return -1;
     87     const unsigned char b3 = sequence[3];
     88     if (length == 3) {
     89         if (b3)
     90             return -1;
     91         const int c = ((b0 & 0xF) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
     92         if (c < 0x800)
     93             return -1;
     94         // UTF-16 surrogates should never appear in UTF-8 data.
     95         if (c >= 0xD800 && c <= 0xDFFF)
     96             return -1;
     97         return c;
     98     }
     99 
    100     // Handle 4-byte sequences.
    101     if ((b3 & 0xC0) != 0x80)
    102         return -1;
    103     const unsigned char b4 = sequence[4];
    104     if (length == 4) {
    105         if (b4)
    106             return -1;
    107         const int c = ((b0 & 0x7) << 18) | ((b1 & 0x3F) << 12) | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
    108         if (c < 0x10000 || c > 0x10FFFF)
    109             return -1;
    110         return c;
    111     }
    112 
    113     return -1;
    114 }
    115 
// Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
// into the first byte, depending on how many bytes follow.  There are
// as many entries in this table as there are UTF-8 sequence types.
// (I.e., one byte sequence, two byte... etc.). Remember that sequences
// for *legal* UTF-8 will be 4 or fewer bytes total.
// The table is indexed by the total length of the sequence in bytes;
// convertUTF16ToUTF8 only ever indexes it with values 1 through 4.
static const unsigned char firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
    122 
    123 ConversionResult convertUTF16ToUTF8(
    124     const UChar** sourceStart, const UChar* sourceEnd,
    125     char** targetStart, char* targetEnd, bool strict)
    126 {
    127     ConversionResult result = conversionOK;
    128     const UChar* source = *sourceStart;
    129     char* target = *targetStart;
    130     while (source < sourceEnd) {
    131         UChar32 ch;
    132         unsigned short bytesToWrite = 0;
    133         const UChar32 byteMask = 0xBF;
    134         const UChar32 byteMark = 0x80;
    135         const UChar* oldSource = source; // In case we have to back up because of target overflow.
    136         ch = static_cast<unsigned short>(*source++);
    137         // If we have a surrogate pair, convert to UChar32 first.
    138         if (ch >= 0xD800 && ch <= 0xDBFF) {
    139             // If the 16 bits following the high surrogate are in the source buffer...
    140             if (source < sourceEnd) {
    141                 UChar32 ch2 = static_cast<unsigned short>(*source);
    142                 // If it's a low surrogate, convert to UChar32.
    143                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
    144                     ch = ((ch - 0xD800) << 10) + (ch2 - 0xDC00) + 0x0010000;
    145                     ++source;
    146                 } else if (strict) { // it's an unpaired high surrogate
    147                     --source; // return to the illegal value itself
    148                     result = sourceIllegal;
    149                     break;
    150                 }
    151             } else { // We don't have the 16 bits following the high surrogate.
    152                 --source; // return to the high surrogate
    153                 result = sourceExhausted;
    154                 break;
    155             }
    156         } else if (strict) {
    157             // UTF-16 surrogate values are illegal in UTF-32
    158             if (ch >= 0xDC00 && ch <= 0xDFFF) {
    159                 --source; // return to the illegal value itself
    160                 result = sourceIllegal;
    161                 break;
    162             }
    163         }
    164         // Figure out how many bytes the result will require
    165         if (ch < (UChar32)0x80) {
    166             bytesToWrite = 1;
    167         } else if (ch < (UChar32)0x800) {
    168             bytesToWrite = 2;
    169         } else if (ch < (UChar32)0x10000) {
    170             bytesToWrite = 3;
    171         } else if (ch < (UChar32)0x110000) {
    172             bytesToWrite = 4;
    173         } else {
    174             bytesToWrite = 3;
    175             ch = 0xFFFD;
    176         }
    177 
    178         target += bytesToWrite;
    179         if (target > targetEnd) {
    180             source = oldSource; // Back up source pointer!
    181             target -= bytesToWrite;
    182             result = targetExhausted;
    183             break;
    184         }
    185         switch (bytesToWrite) { // note: everything falls through.
    186             case 4: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    187             case 3: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    188             case 2: *--target = (char)((ch | byteMark) & byteMask); ch >>= 6;
    189             case 1: *--target =  (char)(ch | firstByteMark[bytesToWrite]);
    190         }
    191         target += bytesToWrite;
    192     }
    193     *sourceStart = source;
    194     *targetStart = target;
    195     return result;
    196 }
    197 
    198 // This must be called with the length pre-determined by the first byte.
    199 // If presented with a length > 4, this returns false.  The Unicode
    200 // definition of UTF-8 goes up to 4-byte sequences.
    201 static bool isLegalUTF8(const unsigned char* source, int length)
    202 {
    203     unsigned char a;
    204     const unsigned char* srcptr = source + length;
    205     switch (length) {
    206         default: return false;
    207         // Everything else falls through when "true"...
    208         case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    209         case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
    210         case 2: if ((a = (*--srcptr)) > 0xBF) return false;
    211 
    212         switch (*source) {
    213             // no fall-through in this inner switch
    214             case 0xE0: if (a < 0xA0) return false; break;
    215             case 0xED: if (a > 0x9F) return false; break;
    216             case 0xF0: if (a < 0x90) return false; break;
    217             case 0xF4: if (a > 0x8F) return false; break;
    218             default:   if (a < 0x80) return false;
    219         }
    220 
    221         case 1: if (*source >= 0x80 && *source < 0xC2) return false;
    222     }
    223     if (*source > 0xF4)
    224         return false;
    225     return true;
    226 }
    227 
    228 // Magic values subtracted from a buffer value during UTF8 conversion.
    229 // This table contains as many values as there might be trailing bytes
    230 // in a UTF-8 sequence.
    231 static const UChar32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
    232             0x03C82080UL, 0xFA082080UL, 0x82082080UL };
    233 
    234 ConversionResult convertUTF8ToUTF16(
    235     const char** sourceStart, const char* sourceEnd,
    236     UChar** targetStart, UChar* targetEnd, bool strict)
    237 {
    238     ConversionResult result = conversionOK;
    239     const char* source = *sourceStart;
    240     UChar* target = *targetStart;
    241     while (source < sourceEnd) {
    242         UChar32 ch = 0;
    243         int extraBytesToRead = UTF8SequenceLength(*source) - 1;
    244         if (source + extraBytesToRead >= sourceEnd) {
    245             result = sourceExhausted;
    246             break;
    247         }
    248         // Do this check whether lenient or strict
    249         if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(source), extraBytesToRead + 1)) {
    250             result = sourceIllegal;
    251             break;
    252         }
    253         // The cases all fall through.
    254         switch (extraBytesToRead) {
    255             case 5: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
    256             case 4: ch += static_cast<unsigned char>(*source++); ch <<= 6; // remember, illegal UTF-8
    257             case 3: ch += static_cast<unsigned char>(*source++); ch <<= 6;
    258             case 2: ch += static_cast<unsigned char>(*source++); ch <<= 6;
    259             case 1: ch += static_cast<unsigned char>(*source++); ch <<= 6;
    260             case 0: ch += static_cast<unsigned char>(*source++);
    261         }
    262         ch -= offsetsFromUTF8[extraBytesToRead];
    263 
    264         if (target >= targetEnd) {
    265             source -= (extraBytesToRead + 1); // Back up source pointer!
    266             result = targetExhausted; break;
    267         }
    268         if (ch <= 0xFFFF) {
    269             // UTF-16 surrogate values are illegal in UTF-32
    270             if (ch >= 0xD800 && ch <= 0xDFFF) {
    271                 if (strict) {
    272                     source -= (extraBytesToRead + 1); // return to the illegal value itself
    273                     result = sourceIllegal;
    274                     break;
    275                 } else
    276                     *target++ = 0xFFFD;
    277             } else
    278                 *target++ = (UChar)ch; // normal case
    279         } else if (ch > 0x10FFFF) {
    280             if (strict) {
    281                 result = sourceIllegal;
    282                 source -= (extraBytesToRead + 1); // return to the start
    283                 break; // Bail out; shouldn't continue
    284             } else
    285                 *target++ = 0xFFFD;
    286         } else {
    287             // target is a character in range 0xFFFF - 0x10FFFF
    288             if (target + 1 >= targetEnd) {
    289                 source -= (extraBytesToRead + 1); // Back up source pointer!
    290                 result = targetExhausted;
    291                 break;
    292             }
    293             ch -= 0x0010000UL;
    294             *target++ = (UChar)((ch >> 10) + 0xD800);
    295             *target++ = (UChar)((ch & 0x03FF) + 0xDC00);
    296         }
    297     }
    298     *sourceStart = source;
    299     *targetStart = target;
    300     return result;
    301 }
    302 
    303 }
    304 }
    305