Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u8.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
     17 *
     18 *   Also, CESU-8 implementation, see UTR 26.
     19 *   The CESU-8 converter uses all the same functions as the
     20 *   UTF-8 converter, with a branch for converting supplementary code points.
     21 */
     22 
     23 #include "unicode/utypes.h"
     24 
     25 #if !UCONFIG_NO_CONVERSION
     26 
     27 #include "unicode/ucnv.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf8.h"
     30 #include "unicode/utf16.h"
     31 #include "ucnv_bld.h"
     32 #include "ucnv_cnv.h"
     33 #include "cmemory.h"
     34 
     35 /* Prototypes --------------------------------------------------------------- */
     36 
     37 /* Keep these here to make finicky compilers happy */
     38 
     39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
     40                                            UErrorCode *err);
     41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
     42                                                         UErrorCode *err);
     43 
     44 
     45 /* UTF-8 -------------------------------------------------------------------- */
     46 
     47 /* UTF-8 Conversion DATA
     48  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
     49  */
     50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
     51 #define MAXIMUM_UCS2            0x0000FFFF
     52 #define MAXIMUM_UTF             0x0010FFFF
     53 #define MAXIMUM_UCS4            0x7FFFFFFF
     54 #define HALF_SHIFT              10
     55 #define HALF_BASE               0x0010000
     56 #define HALF_MASK               0x3FF
     57 #define SURROGATE_HIGH_START    0xD800
     58 #define SURROGATE_HIGH_END      0xDBFF
     59 #define SURROGATE_LOW_START     0xDC00
     60 #define SURROGATE_LOW_END       0xDFFF
     61 
     62 /* -SURROGATE_LOW_START + HALF_BASE */
     63 #define SURROGATE_LOW_BASE      9216
     64 
     65 static const uint32_t offsetsFromUTF8[7] = {0,
     66   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
     67   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
     68 };
     69 
     70 /* END OF UTF-8 Conversion DATA */
     71 
     72 static const int8_t bytesFromUTF8[256] = {
     73   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     74   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     75   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     76   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     77   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     78   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     79   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     80   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
     81 };
     82 
     83 /*
     84  * Starting with Unicode 3.0.1:
     85  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
     86  * byte sequences with more than 4 bytes are illegal in UTF-8,
     87  * which is tested with impossible values for them
     88  */
     89 static const uint32_t
     90 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
     91 
     92 static UBool hasCESU8Data(const UConverter *cnv)
     93 {
     94 #if UCONFIG_ONLY_HTML_CONVERSION
     95     return FALSE;
     96 #else
     97     return (UBool)(cnv->sharedData == &_CESU8Data);
     98 #endif
     99 }
    100 
    101 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
    102                                   UErrorCode * err)
    103 {
    104     UConverter *cnv = args->converter;
    105     const unsigned char *mySource = (unsigned char *) args->source;
    106     UChar *myTarget = args->target;
    107     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    108     const UChar *targetLimit = args->targetLimit;
    109     unsigned char *toUBytes = cnv->toUBytes;
    110     UBool isCESU8 = hasCESU8Data(cnv);
    111     uint32_t ch, ch2 = 0;
    112     int32_t i, inBytes;
    113 
    114     /* Restore size of current sequence */
    115     if (cnv->toUnicodeStatus && myTarget < targetLimit)
    116     {
    117         inBytes = cnv->mode;            /* restore # of bytes to consume */
    118         i = cnv->toULength;             /* restore # of bytes consumed */
    119         cnv->toULength = 0;
    120 
    121         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
    122         cnv->toUnicodeStatus = 0;
    123         goto morebytes;
    124     }
    125 
    126 
    127     while (mySource < sourceLimit && myTarget < targetLimit)
    128     {
    129         ch = *(mySource++);
    130         if (ch < 0x80)        /* Simple case */
    131         {
    132             *(myTarget++) = (UChar) ch;
    133         }
    134         else
    135         {
    136             /* store the first char */
    137             toUBytes[0] = (char)ch;
    138             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
    139             i = 1;
    140 
    141 morebytes:
    142             while (i < inBytes)
    143             {
    144                 if (mySource < sourceLimit)
    145                 {
    146                     toUBytes[i] = (char) (ch2 = *mySource);
    147                     if (!U8_IS_TRAIL(ch2))
    148                     {
    149                         break; /* i < inBytes */
    150                     }
    151                     ch = (ch << 6) + ch2;
    152                     ++mySource;
    153                     i++;
    154                 }
    155                 else
    156                 {
    157                     /* stores a partially calculated target*/
    158                     cnv->toUnicodeStatus = ch;
    159                     cnv->mode = inBytes;
    160                     cnv->toULength = (int8_t) i;
    161                     goto donefornow;
    162                 }
    163             }
    164 
    165             /* Remove the accumulated high bits */
    166             ch -= offsetsFromUTF8[inBytes];
    167 
    168             /*
    169              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    170              * - use only trail bytes after a lead byte (checked above)
    171              * - use the right number of trail bytes for a given lead byte
    172              * - encode a code point <= U+10ffff
    173              * - use the fewest possible number of bytes for their code points
    174              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    175              *
    176              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    177              * There are no irregular sequences any more.
    178              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
    179              */
    180             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
    181                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
    182             {
    183                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    184                 if (ch <= MAXIMUM_UCS2)
    185                 {
    186                     /* fits in 16 bits */
    187                     *(myTarget++) = (UChar) ch;
    188                 }
    189                 else
    190                 {
    191                     /* write out the surrogates */
    192                     ch -= HALF_BASE;
    193                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
    194                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
    195                     if (myTarget < targetLimit)
    196                     {
    197                         *(myTarget++) = (UChar)ch;
    198                     }
    199                     else
    200                     {
    201                         /* Put in overflow buffer (not handled here) */
    202                         cnv->UCharErrorBuffer[0] = (UChar) ch;
    203                         cnv->UCharErrorBufferLength = 1;
    204                         *err = U_BUFFER_OVERFLOW_ERROR;
    205                         break;
    206                     }
    207                 }
    208             }
    209             else
    210             {
    211                 cnv->toULength = (int8_t)i;
    212                 *err = U_ILLEGAL_CHAR_FOUND;
    213                 break;
    214             }
    215         }
    216     }
    217 
    218 donefornow:
    219     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    220     {
    221         /* End of target buffer */
    222         *err = U_BUFFER_OVERFLOW_ERROR;
    223     }
    224 
    225     args->target = myTarget;
    226     args->source = (const char *) mySource;
    227 }
    228 
    229 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
    230                                                 UErrorCode * err)
    231 {
    232     UConverter *cnv = args->converter;
    233     const unsigned char *mySource = (unsigned char *) args->source;
    234     UChar *myTarget = args->target;
    235     int32_t *myOffsets = args->offsets;
    236     int32_t offsetNum = 0;
    237     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    238     const UChar *targetLimit = args->targetLimit;
    239     unsigned char *toUBytes = cnv->toUBytes;
    240     UBool isCESU8 = hasCESU8Data(cnv);
    241     uint32_t ch, ch2 = 0;
    242     int32_t i, inBytes;
    243 
    244     /* Restore size of current sequence */
    245     if (cnv->toUnicodeStatus && myTarget < targetLimit)
    246     {
    247         inBytes = cnv->mode;            /* restore # of bytes to consume */
    248         i = cnv->toULength;             /* restore # of bytes consumed */
    249         cnv->toULength = 0;
    250 
    251         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
    252         cnv->toUnicodeStatus = 0;
    253         goto morebytes;
    254     }
    255 
    256     while (mySource < sourceLimit && myTarget < targetLimit)
    257     {
    258         ch = *(mySource++);
    259         if (ch < 0x80)        /* Simple case */
    260         {
    261             *(myTarget++) = (UChar) ch;
    262             *(myOffsets++) = offsetNum++;
    263         }
    264         else
    265         {
    266             toUBytes[0] = (char)ch;
    267             inBytes = bytesFromUTF8[ch];
    268             i = 1;
    269 
    270 morebytes:
    271             while (i < inBytes)
    272             {
    273                 if (mySource < sourceLimit)
    274                 {
    275                     toUBytes[i] = (char) (ch2 = *mySource);
    276                     if (!U8_IS_TRAIL(ch2))
    277                     {
    278                         break; /* i < inBytes */
    279                     }
    280                     ch = (ch << 6) + ch2;
    281                     ++mySource;
    282                     i++;
    283                 }
    284                 else
    285                 {
    286                     cnv->toUnicodeStatus = ch;
    287                     cnv->mode = inBytes;
    288                     cnv->toULength = (int8_t)i;
    289                     goto donefornow;
    290                 }
    291             }
    292 
    293             /* Remove the accumulated high bits */
    294             ch -= offsetsFromUTF8[inBytes];
    295 
    296             /*
    297              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    298              * - use only trail bytes after a lead byte (checked above)
    299              * - use the right number of trail bytes for a given lead byte
    300              * - encode a code point <= U+10ffff
    301              * - use the fewest possible number of bytes for their code points
    302              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    303              *
    304              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    305              * There are no irregular sequences any more.
    306              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
    307              */
    308             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
    309                 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch)))
    310             {
    311                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    312                 if (ch <= MAXIMUM_UCS2)
    313                 {
    314                     /* fits in 16 bits */
    315                     *(myTarget++) = (UChar) ch;
    316                     *(myOffsets++) = offsetNum;
    317                 }
    318                 else
    319                 {
    320                     /* write out the surrogates */
    321                     ch -= HALF_BASE;
    322                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
    323                     *(myOffsets++) = offsetNum;
    324                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
    325                     if (myTarget < targetLimit)
    326                     {
    327                         *(myTarget++) = (UChar)ch;
    328                         *(myOffsets++) = offsetNum;
    329                     }
    330                     else
    331                     {
    332                         cnv->UCharErrorBuffer[0] = (UChar) ch;
    333                         cnv->UCharErrorBufferLength = 1;
    334                         *err = U_BUFFER_OVERFLOW_ERROR;
    335                     }
    336                 }
    337                 offsetNum += i;
    338             }
    339             else
    340             {
    341                 cnv->toULength = (int8_t)i;
    342                 *err = U_ILLEGAL_CHAR_FOUND;
    343                 break;
    344             }
    345         }
    346     }
    347 
    348 donefornow:
    349     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    350     {   /* End of target buffer */
    351         *err = U_BUFFER_OVERFLOW_ERROR;
    352     }
    353 
    354     args->target = myTarget;
    355     args->source = (const char *) mySource;
    356     args->offsets = myOffsets;
    357 }
    358 
    359 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
    360                                     UErrorCode * err)
    361 {
    362     UConverter *cnv = args->converter;
    363     const UChar *mySource = args->source;
    364     const UChar *sourceLimit = args->sourceLimit;
    365     uint8_t *myTarget = (uint8_t *) args->target;
    366     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    367     uint8_t *tempPtr;
    368     UChar32 ch;
    369     uint8_t tempBuf[4];
    370     int32_t indexToWrite;
    371     UBool isNotCESU8 = !hasCESU8Data(cnv);
    372 
    373     if (cnv->fromUChar32 && myTarget < targetLimit)
    374     {
    375         ch = cnv->fromUChar32;
    376         cnv->fromUChar32 = 0;
    377         goto lowsurrogate;
    378     }
    379 
    380     while (mySource < sourceLimit && myTarget < targetLimit)
    381     {
    382         ch = *(mySource++);
    383 
    384         if (ch < 0x80)        /* Single byte */
    385         {
    386             *(myTarget++) = (uint8_t) ch;
    387         }
    388         else if (ch < 0x800)  /* Double byte */
    389         {
    390             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    391             if (myTarget < targetLimit)
    392             {
    393                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    394             }
    395             else
    396             {
    397                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    398                 cnv->charErrorBufferLength = 1;
    399                 *err = U_BUFFER_OVERFLOW_ERROR;
    400             }
    401         }
    402         else {
    403             /* Check for surrogates */
    404             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
    405 lowsurrogate:
    406                 if (mySource < sourceLimit) {
    407                     /* test both code units */
    408                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
    409                         /* convert and consume this supplementary code point */
    410                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
    411                         ++mySource;
    412                         /* exit this condition tree */
    413                     }
    414                     else {
    415                         /* this is an unpaired trail or lead code unit */
    416                         /* callback(illegal) */
    417                         cnv->fromUChar32 = ch;
    418                         *err = U_ILLEGAL_CHAR_FOUND;
    419                         break;
    420                     }
    421                 }
    422                 else {
    423                     /* no more input */
    424                     cnv->fromUChar32 = ch;
    425                     break;
    426                 }
    427             }
    428 
    429             /* Do we write the buffer directly for speed,
    430             or do we have to be careful about target buffer space? */
    431             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    432 
    433             if (ch <= MAXIMUM_UCS2) {
    434                 indexToWrite = 2;
    435                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    436             }
    437             else {
    438                 indexToWrite = 3;
    439                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    440                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    441             }
    442             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    443             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    444 
    445             if (tempPtr == myTarget) {
    446                 /* There was enough space to write the codepoint directly. */
    447                 myTarget += (indexToWrite + 1);
    448             }
    449             else {
    450                 /* We might run out of room soon. Write it slowly. */
    451                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    452                     if (myTarget < targetLimit) {
    453                         *(myTarget++) = *tempPtr;
    454                     }
    455                     else {
    456                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    457                         *err = U_BUFFER_OVERFLOW_ERROR;
    458                     }
    459                 }
    460             }
    461         }
    462     }
    463 
    464     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    465     {
    466         *err = U_BUFFER_OVERFLOW_ERROR;
    467     }
    468 
    469     args->target = (char *) myTarget;
    470     args->source = mySource;
    471 }
    472 
    473 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
    474                                                   UErrorCode * err)
    475 {
    476     UConverter *cnv = args->converter;
    477     const UChar *mySource = args->source;
    478     int32_t *myOffsets = args->offsets;
    479     const UChar *sourceLimit = args->sourceLimit;
    480     uint8_t *myTarget = (uint8_t *) args->target;
    481     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    482     uint8_t *tempPtr;
    483     UChar32 ch;
    484     int32_t offsetNum, nextSourceIndex;
    485     int32_t indexToWrite;
    486     uint8_t tempBuf[4];
    487     UBool isNotCESU8 = !hasCESU8Data(cnv);
    488 
    489     if (cnv->fromUChar32 && myTarget < targetLimit)
    490     {
    491         ch = cnv->fromUChar32;
    492         cnv->fromUChar32 = 0;
    493         offsetNum = -1;
    494         nextSourceIndex = 0;
    495         goto lowsurrogate;
    496     } else {
    497         offsetNum = 0;
    498     }
    499 
    500     while (mySource < sourceLimit && myTarget < targetLimit)
    501     {
    502         ch = *(mySource++);
    503 
    504         if (ch < 0x80)        /* Single byte */
    505         {
    506             *(myOffsets++) = offsetNum++;
    507             *(myTarget++) = (char) ch;
    508         }
    509         else if (ch < 0x800)  /* Double byte */
    510         {
    511             *(myOffsets++) = offsetNum;
    512             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    513             if (myTarget < targetLimit)
    514             {
    515                 *(myOffsets++) = offsetNum++;
    516                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    517             }
    518             else
    519             {
    520                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    521                 cnv->charErrorBufferLength = 1;
    522                 *err = U_BUFFER_OVERFLOW_ERROR;
    523             }
    524         }
    525         else
    526         /* Check for surrogates */
    527         {
    528             nextSourceIndex = offsetNum + 1;
    529 
    530             if(U16_IS_SURROGATE(ch) && isNotCESU8) {
    531 lowsurrogate:
    532                 if (mySource < sourceLimit) {
    533                     /* test both code units */
    534                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
    535                         /* convert and consume this supplementary code point */
    536                         ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
    537                         ++mySource;
    538                         ++nextSourceIndex;
    539                         /* exit this condition tree */
    540                     }
    541                     else {
    542                         /* this is an unpaired trail or lead code unit */
    543                         /* callback(illegal) */
    544                         cnv->fromUChar32 = ch;
    545                         *err = U_ILLEGAL_CHAR_FOUND;
    546                         break;
    547                     }
    548                 }
    549                 else {
    550                     /* no more input */
    551                     cnv->fromUChar32 = ch;
    552                     break;
    553                 }
    554             }
    555 
    556             /* Do we write the buffer directly for speed,
    557             or do we have to be careful about target buffer space? */
    558             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    559 
    560             if (ch <= MAXIMUM_UCS2) {
    561                 indexToWrite = 2;
    562                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    563             }
    564             else {
    565                 indexToWrite = 3;
    566                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    567                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    568             }
    569             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    570             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    571 
    572             if (tempPtr == myTarget) {
    573                 /* There was enough space to write the codepoint directly. */
    574                 myTarget += (indexToWrite + 1);
    575                 myOffsets[0] = offsetNum;
    576                 myOffsets[1] = offsetNum;
    577                 myOffsets[2] = offsetNum;
    578                 if (indexToWrite >= 3) {
    579                     myOffsets[3] = offsetNum;
    580                 }
    581                 myOffsets += (indexToWrite + 1);
    582             }
    583             else {
    584                 /* We might run out of room soon. Write it slowly. */
    585                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    586                     if (myTarget < targetLimit)
    587                     {
    588                         *(myOffsets++) = offsetNum;
    589                         *(myTarget++) = *tempPtr;
    590                     }
    591                     else
    592                     {
    593                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    594                         *err = U_BUFFER_OVERFLOW_ERROR;
    595                     }
    596                 }
    597             }
    598             offsetNum = nextSourceIndex;
    599         }
    600     }
    601 
    602     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    603     {
    604         *err = U_BUFFER_OVERFLOW_ERROR;
    605     }
    606 
    607     args->target = (char *) myTarget;
    608     args->source = mySource;
    609     args->offsets = myOffsets;
    610 }
    611 
    612 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
    613                                                UErrorCode *err) {
    614     UConverter *cnv;
    615     const uint8_t *sourceInitial;
    616     const uint8_t *source;
    617     uint16_t extraBytesToWrite;
    618     uint8_t myByte;
    619     UChar32 ch;
    620     int8_t i, isLegalSequence;
    621 
    622     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
    623 
    624     cnv = args->converter;
    625     sourceInitial = source = (const uint8_t *)args->source;
    626     if (source >= (const uint8_t *)args->sourceLimit)
    627     {
    628         /* no input */
    629         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    630         return 0xffff;
    631     }
    632 
    633     myByte = (uint8_t)*(source++);
    634     if (myByte < 0x80)
    635     {
    636         args->source = (const char *)source;
    637         return (UChar32)myByte;
    638     }
    639 
    640     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
    641     if (extraBytesToWrite == 0) {
    642         cnv->toUBytes[0] = myByte;
    643         cnv->toULength = 1;
    644         *err = U_ILLEGAL_CHAR_FOUND;
    645         args->source = (const char *)source;
    646         return 0xffff;
    647     }
    648 
    649     /*The byte sequence is longer than the buffer area passed*/
    650     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
    651     {
    652         /* check if all of the remaining bytes are trail bytes */
    653         cnv->toUBytes[0] = myByte;
    654         i = 1;
    655         *err = U_TRUNCATED_CHAR_FOUND;
    656         while(source < (const uint8_t *)args->sourceLimit) {
    657             if(U8_IS_TRAIL(myByte = *source)) {
    658                 cnv->toUBytes[i++] = myByte;
    659                 ++source;
    660             } else {
    661                 /* error even before we run out of input */
    662                 *err = U_ILLEGAL_CHAR_FOUND;
    663                 break;
    664             }
    665         }
    666         cnv->toULength = i;
    667         args->source = (const char *)source;
    668         return 0xffff;
    669     }
    670 
    671     isLegalSequence = 1;
    672     ch = myByte << 6;
    673     switch(extraBytesToWrite)
    674     {
    675       /* note: code falls through cases! (sic)*/
    676     case 6:
    677         ch += (myByte = *source);
    678         ch <<= 6;
    679         if (!U8_IS_TRAIL(myByte))
    680         {
    681             isLegalSequence = 0;
    682             break;
    683         }
    684         ++source;
    685         U_FALLTHROUGH;
    686     case 5:
    687         ch += (myByte = *source);
    688         ch <<= 6;
    689         if (!U8_IS_TRAIL(myByte))
    690         {
    691             isLegalSequence = 0;
    692             break;
    693         }
    694         ++source;
    695         U_FALLTHROUGH;
    696     case 4:
    697         ch += (myByte = *source);
    698         ch <<= 6;
    699         if (!U8_IS_TRAIL(myByte))
    700         {
    701             isLegalSequence = 0;
    702             break;
    703         }
    704         ++source;
    705         U_FALLTHROUGH;
    706     case 3:
    707         ch += (myByte = *source);
    708         ch <<= 6;
    709         if (!U8_IS_TRAIL(myByte))
    710         {
    711             isLegalSequence = 0;
    712             break;
    713         }
    714         ++source;
    715         U_FALLTHROUGH;
    716     case 2:
    717         ch += (myByte = *source);
    718         if (!U8_IS_TRAIL(myByte))
    719         {
    720             isLegalSequence = 0;
    721             break;
    722         }
    723         ++source;
    724     };
    725     ch -= offsetsFromUTF8[extraBytesToWrite];
    726     args->source = (const char *)source;
    727 
    728     /*
    729      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    730      * - use only trail bytes after a lead byte (checked above)
    731      * - use the right number of trail bytes for a given lead byte
    732      * - encode a code point <= U+10ffff
    733      * - use the fewest possible number of bytes for their code points
    734      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    735      *
    736      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    737      * There are no irregular sequences any more.
    738      */
    739     if (isLegalSequence &&
    740         (uint32_t)ch <= MAXIMUM_UTF &&
    741         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
    742         !U_IS_SURROGATE(ch)
    743     ) {
    744         return ch; /* return the code point */
    745     }
    746 
    747     for(i = 0; sourceInitial < source; ++i) {
    748         cnv->toUBytes[i] = *sourceInitial++;
    749     }
    750     cnv->toULength = i;
    751     *err = U_ILLEGAL_CHAR_FOUND;
    752     return 0xffff;
    753 }
    754 
    755 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
    756 
    757 /*
         * Minimum code point values for n-byte UTF-8 sequences, indexed by n=0..4.
         * ucnv_UTF8FromUTF8() compares the decoded value of a 2- or 3-byte sequence
         * against utf8_minLegal[n] to reject non-shortest ("overlong") forms.
         */
    758 static const UChar32
    759 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
    760 
    761 /*
         * Offsets for n-byte UTF-8 sequences that were calculated with
         * ((lead<<6)+trail)<<6+trail...
         *
         * ucnv_UTF8FromUTF8() accumulates raw bytes with c=(c<<6)+b, so the lead-byte
         * marker bits and the 0x80 marker of every trail byte end up inside c.
         * Subtracting utf8_offsets[n] removes them and leaves the code point:
         *   n=2: (0xC0<<6)+0x80                          == 0x3080
         *   n=3: (((0xE0<<6)+0x80)<<6)+0x80              == 0xE2080
         *   n=4: (((((0xF0<<6)+0x80)<<6)+0x80)<<6)+0x80  == 0x3C82080
         *
         * The array is declared with 7 elements but only 5 initializers are given;
         * the remaining elements are zero-initialized. The code visible in this
         * part of the file only reads indexes 2..4.
         */
    762 static const UChar32
    763 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
    764 
    765 /*
         * "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8().
         *
         * Reads bytes from pToUArgs->source and copies each well-formed sequence
         * unchanged to pFromUArgs->target. A sequence is accepted when it is a
         * single ASCII byte, a 2- or 3-byte form that is neither overlong nor a
         * surrogate, or a 4-byte form that decodes to 0x10000..0x10ffff.
         *
         * Converter state (all on pToUArgs->converter):
         * - toUnicodeStatus, toULength, mode and toUBytes[] carry a partial
         *   sequence from one call to the next.
         *
         * Outcomes:
         * - Except for the early U_USING_DEFAULT_WARNING return, pToUArgs->source
         *   and pFromUArgs->target are written back on every exit.
         * - U_ILLEGAL_CHAR_FOUND: the offending bytes are stored in toUBytes[]
         *   and their number in toULength.
         * - U_BUFFER_OVERFLOW_ERROR: the target is full while input remains.
         * - U_USING_DEFAULT_WARNING: this function declines an edge case involving
         *   a partial character and limited target capacity, leaving it to the
         *   standard/pivoting implementation (see the comments at both places).
         */
    766 static void
    767 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    768                   UConverterToUnicodeArgs *pToUArgs,
    769                   UErrorCode *pErrorCode) {
    770     UConverter *utf8;
    771     const uint8_t *source, *sourceLimit;
    772     uint8_t *target;
    773     int32_t targetCapacity;
    774     int32_t count;
    775 
            /*
             * oldToULength: bytes of the current sequence that came from a previous buffer
             * toULength:    bytes of the current sequence collected so far
             * toULimit:     total length expected for the current sequence
             */
    776     int8_t oldToULength, toULength, toULimit;
    777 
    778     UChar32 c;
    779     uint8_t b, t1, t2;
    780 
    781     /* set up the local pointers */
    782     utf8=pToUArgs->converter;
    783     source=(uint8_t *)pToUArgs->source;
    784     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    785     target=(uint8_t *)pFromUArgs->target;
    786     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    787 
    788     /* get the converter state from the UTF-8 UConverter */
    789     c=(UChar32)utf8->toUnicodeStatus;
    790     if(c!=0) {
    791         toULength=oldToULength=utf8->toULength;
    792         toULimit=(int8_t)utf8->mode;
    793     } else {
    794         toULength=oldToULength=toULimit=0;
    795     }
    796 
    797     count=(int32_t)(sourceLimit-source)+oldToULength;
    798     if(count<toULimit) {
    799         /*
    800          * Not enough input to complete the partial character.
    801          * Jump to moreBytes below - it will not output to target.
    802          */
    803     } else if(targetCapacity<toULimit) {
    804         /*
    805          * Not enough target capacity to output the partial character.
    806          * Let the standard converter handle this.
    807          */
    808         *pErrorCode=U_USING_DEFAULT_WARNING;
    809         return;
    810     } else {
    811         /*
    812          * Use a single counter for source and target, counting the minimum of
    813          * the source length and the target capacity.
    814          * As a result, the source length is checked only once per multi-byte
    815          * character instead of twice.
    816          *
    817          * Make sure that the last byte sequence is complete, or else
    818          * stop just before it.
    819          * (The longest legal byte sequence has 3 trail bytes.)
    820          * Count oldToULength (number of source bytes from a previous buffer)
    821          * into the source length but reduce the source index by toULimit
    822          * while going back over trail bytes in order to not go back into
    823          * the bytes that will be read for finishing a partial
    824          * sequence from the previous buffer.
    825          * Let the standard converter handle edge cases.
    826          */
    827         int32_t i;
    828 
    829         if(count>targetCapacity) {
    830             count=targetCapacity;
    831         }
    832 
                /* walk backwards from the end of the counted range over at most 3 trail bytes */
    833         i=0;
    834         while(i<3 && i<(count-toULimit)) {
    835             b=source[count-oldToULength-i-1];
    836             if(U8_IS_TRAIL(b)) {
    837                 ++i;
    838             } else {
    839                 if(i<U8_COUNT_TRAIL_BYTES(b)) {
    840                     /* stop converting before the lead byte if there are not enough trail bytes for it */
    841                     count-=i+1;
    842                 }
    843                 break;
    844             }
    845         }
    846     }
    847 
            /*
             * Resuming a partial character: clear the saved state and jump straight
             * into the loop body at moreBytes, which skips the count>0 test below.
             */
    848     if(c!=0) {
    849         utf8->toUnicodeStatus=0;
    850         utf8->toULength=0;
    851         goto moreBytes;
    852         /* See note in ucnv_SBCSFromUTF8() about this goto. */
    853     }
    854 
    855     /* conversion loop */
    856     while(count>0) {
    857         b=*source++;
                /* high bit clear: single-byte (ASCII) character */
    858         if((int8_t)b>=0) {
    859             /* convert ASCII */
    860             *target++=b;
    861             --count;
    862             continue;
    863         } else {
                    /*
                     * Fast paths for the common 2- and 3-byte forms. Anything that
                     * does not match falls through to the general code below.
                     */
    864             if(b>0xe0) {
    865                 if( /* handle U+1000..U+D7FF inline */
    866                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
    867                                                (b==0xed && (t1 <= 0x9f))) &&
    868                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
    869                 ) {
    870                     source+=2;
    871                     *target++=b;
    872                     *target++=t1;
    873                     *target++=t2;
    874                     count-=3;
    875                     continue;
    876                 }
    877             } else if(b<0xe0) {
    878                 if( /* handle U+0080..U+07FF inline */
    879                     b>=0xc2 &&
    880                     (t1=*source) >= 0x80 && t1 <= 0xbf
    881                 ) {
    882                     ++source;
    883                     *target++=b;
    884                     *target++=t1;
    885                     count-=2;
    886                     continue;
    887                 }
    888             } else if(b==0xe0) {
    889                 if( /* handle U+0800..U+0FFF inline */
    890                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
    891                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
    892                 ) {
    893                     source+=2;
    894                     *target++=b;
    895                     *target++=t1;
    896                     *target++=t2;
    897                     count-=3;
    898                     continue;
    899                 }
    900             }
    901 
    902             /* handle "complicated" and error cases, and continuing partial characters */
    903             oldToULength=0;
    904             toULength=1;
    905             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
    906             c=b;
    907 moreBytes:
                    /* collect trail bytes until the expected length is reached */
    908             while(toULength<toULimit) {
    909                 if(source<sourceLimit) {
    910                     b=*source;
    911                     if(U8_IS_TRAIL(b)) {
    912                         ++source;
    913                         ++toULength;
    914                         c=(c<<6)+b;
    915                     } else {
    916                         break; /* sequence too short, stop with toULength<toULimit */
    917                     }
    918                 } else {
    919                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
    920                     source-=(toULength-oldToULength);
    921                     while(oldToULength<toULength) {
    922                         utf8->toUBytes[oldToULength++]=*source++;
    923                     }
    924                     utf8->toUnicodeStatus=c;
    925                     utf8->toULength=toULength;
    926                     utf8->mode=toULimit;
    927                     pToUArgs->source=(char *)source;
    928                     pFromUArgs->target=(char *)target;
    929                     return;
    930                 }
    931             }
    932 
                    /*
                     * Validate the collected sequence. Note that the conditions modify c:
                     * c-=utf8_offsets[...] strips the accumulated marker bits, and it is
                     * evaluated at most once because the two branches require different
                     * values of toULength.
                     */
    933             if( toULength==toULimit &&      /* consumed all trail bytes */
    934                 (toULength==3 || toULength==2) &&             /* BMP */
    935                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
    936                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
    937             ) {
    938                 /* legal byte sequence for BMP code point */
    939             } else if(
    940                 toULength==toULimit && toULength==4 &&
    941                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
    942             ) {
    943                 /* legal byte sequence for supplementary code point */
    944             } else {
    945                 /* error handling: illegal UTF-8 byte sequence */
                        /* rewind to the bytes taken from this buffer and append them to toUBytes[] */
    946                 source-=(toULength-oldToULength);
    947                 while(oldToULength<toULength) {
    948                     utf8->toUBytes[oldToULength++]=*source++;
    949                 }
    950                 utf8->toULength=toULength;
    951                 pToUArgs->source=(char *)source;
    952                 pFromUArgs->target=(char *)target;
    953                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    954                 return;
    955             }
    956 
    957             /* copy the legal byte sequence to the target */
    958             {
    959                 int8_t i;
    960 
                        /* first the bytes saved from a previous buffer, then the rest from source */
    961                 for(i=0; i<oldToULength; ++i) {
    962                     *target++=utf8->toUBytes[i];
    963                 }
    964                 source-=(toULength-oldToULength);
    965                 for(; i<toULength; ++i) {
    966                     *target++=*source++;
    967                 }
    968                 count-=toULength;
    969             }
    970         }
    971     }
    972 
            /*
             * The counted range is exhausted but input remains: either the target is
             * full, or the remaining input is a sequence that was excluded from count.
             */
    973     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
    974         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
    975             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    976         } else {
    977             b=*source;
    978             toULimit=U8_COUNT_TRAIL_BYTES(b)+1;
    979             if(toULimit>(sourceLimit-source)) {
    980                 /* collect a truncated byte sequence */
    981                 toULength=0;
    982                 c=b;
    983                 for(;;) {
    984                     utf8->toUBytes[toULength++]=b;
    985                     if(++source==sourceLimit) {
    986                         /* partial byte sequence at end of source */
    987                         utf8->toUnicodeStatus=c;
    988                         utf8->toULength=toULength;
    989                         utf8->mode=toULimit;
    990                         break;
    991                     } else if(!U8_IS_TRAIL(b=*source)) {
    992                         /* lead byte in trail byte position */
    993                         utf8->toULength=toULength;
    994                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    995                         break;
    996                     }
    997                     c=(c<<6)+b;
    998                 }
    999             } else {
   1000                 /* partial-sequence target overflow: fall back to the pivoting implementation */
   1001                 *pErrorCode=U_USING_DEFAULT_WARNING;
   1002             }
   1003         }
   1004     }
   1005 
   1006     /* write back the updated pointers */
   1007     pToUArgs->source=(char *)source;
   1008     pFromUArgs->target=(char *)target;
   1009 }
   1010 
   1011 /* UTF-8 converter data ----------------------------------------------------- */
   1012 
   1013 static const UConverterImpl _UTF8Impl={
            /*
             * Implementation table for the UTF-8 converter.
             * This is a positional initializer; the meaning of each slot is defined
             * by the UConverterImpl declaration, which is not visible in this file
             * (presumably in ucnv_bld.h or ucnv_cnv.h - confirm there before
             * reordering or inserting entries). NULL leaves a slot unset.
             */
   1014     UCNV_UTF8,
   1015 
   1016     NULL,
   1017     NULL,
   1018 
   1019     NULL,
   1020     NULL,
   1021     NULL,
   1022 
            /* conversion entry points, each with an offsets-tracking variant */
   1023     ucnv_toUnicode_UTF8,
   1024     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
   1025     ucnv_fromUnicode_UTF8,
   1026     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   1027     ucnv_getNextUChar_UTF8,
   1028 
   1029     NULL,
   1030     NULL,
   1031     NULL,
   1032     NULL,
   1033     ucnv_getNonSurrogateUnicodeSet,
   1034 
            /* the same validate-and-copy function is registered in both of the last two slots */
   1035     ucnv_UTF8FromUTF8,
   1036     ucnv_UTF8FromUTF8
   1037 };
   1038 
   1039 /*
         * Static descriptor for the UTF-8 converter.
         * CCSID 1208 refers to UTF-8 for any version of Unicode.
         * Positional initializer: field meanings come from the UConverterStaticData
         * declaration, which is not visible here - confirm there before editing.
         */
   1040 static const UConverterStaticData _UTF8StaticData={
   1041     sizeof(UConverterStaticData),
   1042     "UTF-8",
   1043     1208, UCNV_IBM, UCNV_UTF8,
   1044     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
            /* EF BF BD is U+FFFD in UTF-8, followed by its length 3 (presumably the substitution character - confirm) */
   1045     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
   1046     0,
   1047     0,
   1048     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1049 };
   1050 
   1051 
   1052 const UConverterSharedData _UTF8Data= /* non-static: pairs the UTF-8 static descriptor with its implementation table */
   1053         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
   1054 
   1055 /* CESU-8 converter data ---------------------------------------------------- */
   1056 
   1057 static const UConverterImpl _CESU8Impl={
            /*
             * Implementation table for the CESU-8 converter.
             * Same positional layout as _UTF8Impl and the same four conversion
             * functions. It differs from _UTF8Impl in three places: the slot after the
             * conversion functions is NULL instead of ucnv_getNextUChar_UTF8, the
             * Unicode-set function is ucnv_getCompleteUnicodeSet, and the last two
             * slots are NULL instead of ucnv_UTF8FromUTF8.
             * Slot meanings are defined by UConverterImpl, declared elsewhere.
             */
   1058     UCNV_CESU8,
   1059 
   1060     NULL,
   1061     NULL,
   1062 
   1063     NULL,
   1064     NULL,
   1065     NULL,
   1066 
   1067     ucnv_toUnicode_UTF8,
   1068     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
   1069     ucnv_fromUnicode_UTF8,
   1070     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   1071     NULL,
   1072 
   1073     NULL,
   1074     NULL,
   1075     NULL,
   1076     NULL,
   1077     ucnv_getCompleteUnicodeSet,
   1078 
   1079     NULL,
   1080     NULL
   1081 };
   1082 
   1083 static const UConverterStaticData _CESU8StaticData={
            /*
             * Static descriptor for the CESU-8 converter. Same positional layout as
             * _UTF8StaticData; it differs in the name, the CCSID, and the
             * platform and type constants.
             */
   1084     sizeof(UConverterStaticData),
   1085     "CESU-8",
   1086     9400, /* CCSID for CESU-8 */
   1087     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
            /* EF BF BD is U+FFFD in UTF-8, followed by its length 3 (presumably the substitution character - confirm) */
   1088     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
   1089     0,
   1090     0,
   1091     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1092 };
   1093 
   1094 
   1095 const UConverterSharedData _CESU8Data= /* non-static: pairs the CESU-8 static descriptor with its implementation table */
   1096         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
   1097 
   1098 #endif
   1099