Home | History | Annotate | Download | only in common
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2016, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u8.c
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
     17 *
     18 *   Also, CESU-8 implementation, see UTR 26.
     19 *   The CESU-8 converter uses all the same functions as the
     20 *   UTF-8 converter, with a branch for converting supplementary code points.
     21 */
     22 
     23 #include "unicode/utypes.h"
     24 
     25 #if !UCONFIG_NO_CONVERSION
     26 
     27 #include "unicode/ucnv.h"
     28 #include "unicode/utf.h"
     29 #include "unicode/utf8.h"
     30 #include "unicode/utf16.h"
     31 #include "uassert.h"
     32 #include "ucnv_bld.h"
     33 #include "ucnv_cnv.h"
     34 #include "cmemory.h"
     35 #include "ustr_imp.h"
     36 
     37 /* Prototypes --------------------------------------------------------------- */
     38 
/*
 * Forward declarations of the two non-static from-Unicode converter functions
 * that are defined further down in this file.
 * Keep these here to make finicky compilers happy.
 *
 * NOTE(review): the definitions below are additionally marked U_CALLCONV while
 * these prototypes are not - confirm that both always expand to compatible
 * declarations on every supported compiler.
 */

U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
                                           UErrorCode *err);
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
                                                        UErrorCode *err);
     45 
     46 
     47 /* UTF-8 -------------------------------------------------------------------- */
     48 
     49 #define MAXIMUM_UCS2            0x0000FFFF
     50 
     51 static const uint32_t offsetsFromUTF8[5] = {0,
     52   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
     53   (uint32_t) 0x03C82080
     54 };
     55 
     56 static UBool hasCESU8Data(const UConverter *cnv)
     57 {
     58 #if UCONFIG_ONLY_HTML_CONVERSION
     59     return FALSE;
     60 #else
     61     return (UBool)(cnv->sharedData == &_CESU8Data);
     62 #endif
     63 }
     64 U_CDECL_BEGIN
/**
 * Converts UTF-8 (or CESU-8) bytes to UTF-16 code units, without writing offsets.
 *
 * Reads from args->source up to args->sourceLimit and writes to args->target
 * up to args->targetLimit. Before returning, args->source and args->target
 * are set to the positions that were reached.
 *
 * A multi-byte sequence that is cut off by the end of the input is saved in
 * the converter (toUnicodeStatus = value accumulated so far, mode = expected
 * number of bytes, toULength = number of bytes consumed, toUBytes = the bytes)
 * and is picked up again by a later call that has target space available.
 *
 * @param args  Conversion arguments (converter, source and target pointers
 *              with their limits).
 * @param err   Set to U_ILLEGAL_CHAR_FOUND when a sequence is malformed; the
 *              bytes consumed for it are in cnv->toUBytes and their count is
 *              in cnv->toULength. Set to U_BUFFER_OVERFLOW_ERROR when the
 *              target is full while a trail surrogate or more input remains.
 */
static void  U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (unsigned char *) args->source;
    UChar *myTarget = args->target;
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Restore the state of a sequence that was interrupted by the end of the previous input */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;/* the value accumulated by the previous call */
        cnv->toUnicodeStatus = 0;
        /* jump into the trail byte loop inside the main loop below */
        goto morebytes;
    }


    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (U8_IS_SINGLE(ch))        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
        }
        else
        {
            /* store the first char */
            toUBytes[0] = (char)ch;
            inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */
            i = 1;

morebytes:
            /* Collect the remaining bytes; ch accumulates the raw byte values. */
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    /*
                     * Record the byte before validating it. If it is rejected it
                     * is neither consumed (mySource stays) nor counted in i.
                     */
                    toUBytes[i] = (char) (ch2 = *mySource);
                    /*
                     * CESU-8 exception: directly after a 0xed lead byte any trail
                     * byte is accepted, even if isValidTrail() rejects it.
                     */
                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
                    {
                        break; /* i < inBytes */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input in mid-sequence: save the partially calculated target */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t) i;
                    goto donefornow;
                }
            }

            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
            // A complete sequence of more than 3 bytes is therefore treated as illegal there.
            if (i == inBytes && (!isCESU8 || i <= 3))
            {
                /* Remove the accumulated high bits */
                ch -= offsetsFromUTF8[inBytes];

                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                }
                else
                {
                    /* write out the surrogates */
                    *(myTarget++) = U16_LEAD(ch);
                    ch = U16_TRAIL(ch);
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                    }
                    else
                    {
                        /* No room for the trail surrogate: park it in the converter's overflow buffer */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                        break;
                    }
                }
            }
            else
            {
                /* Malformed sequence: report how many bytes of it were consumed */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    /* Input is left over, the target is full and no other error was set */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report the positions reached back to the caller */
    args->target = myTarget;
    args->source = (const char *) mySource;
}
    180 
/**
 * Converts UTF-8 (or CESU-8) bytes to UTF-16 code units and records, for every
 * code unit written, the source index of the first byte of the sequence that
 * produced it in args->offsets.
 *
 * Same conversion logic and converter state handling as ucnv_toUnicode_UTF8():
 * an input sequence cut off by the end of the input is saved in
 * toUnicodeStatus / mode / toULength / toUBytes and resumed by a later call.
 * Before returning, args->source, args->target and args->offsets are set to
 * the positions that were reached.
 *
 * @param args  Conversion arguments (converter, source/target pointers with
 *              their limits, and the offsets array).
 * @param err   Set to U_ILLEGAL_CHAR_FOUND for a malformed sequence
 *              (cnv->toULength holds the number of bytes consumed for it), or
 *              to U_BUFFER_OVERFLOW_ERROR when the target is full while a
 *              trail surrogate or more input remains.
 */
static void  U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
                                                UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const unsigned char *mySource = (unsigned char *) args->source;
    UChar *myTarget = args->target;
    int32_t *myOffsets = args->offsets;
    int32_t offsetNum = 0;  /* source index of the start of the current sequence */
    const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    const UChar *targetLimit = args->targetLimit;
    unsigned char *toUBytes = cnv->toUBytes;
    UBool isCESU8 = hasCESU8Data(cnv);
    uint32_t ch, ch2 = 0;
    int32_t i, inBytes;

    /* Restore the state of a sequence that was interrupted by the end of the previous input */
    if (cnv->toUnicodeStatus && myTarget < targetLimit)
    {
        inBytes = cnv->mode;            /* restore # of bytes to consume */
        i = cnv->toULength;             /* restore # of bytes consumed */
        cnv->toULength = 0;

        ch = cnv->toUnicodeStatus;/* the value accumulated by the previous call */
        cnv->toUnicodeStatus = 0;
        /*
         * NOTE(review): offsetNum is still 0 here and i includes bytes from the
         * previous buffer, so offsetNum += i below counts those bytes too -
         * presumably the caller adjusts offsets for resumed sequences; confirm.
         */
        goto morebytes;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);
        if (U8_IS_SINGLE(ch))        /* Simple case */
        {
            *(myTarget++) = (UChar) ch;
            *(myOffsets++) = offsetNum++;
        }
        else
        {
            /* store the first byte and look up the sequence length */
            toUBytes[0] = (char)ch;
            inBytes = U8_COUNT_BYTES_NON_ASCII(ch);
            i = 1;

morebytes:
            /* Collect the remaining bytes; ch accumulates the raw byte values. */
            while (i < inBytes)
            {
                if (mySource < sourceLimit)
                {
                    /*
                     * Record the byte before validating it. If it is rejected it
                     * is neither consumed (mySource stays) nor counted in i.
                     */
                    toUBytes[i] = (char) (ch2 = *mySource);
                    /*
                     * CESU-8 exception: directly after a 0xed lead byte any trail
                     * byte is accepted, even if isValidTrail() rejects it.
                     */
                    if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) &&
                            !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2)))
                    {
                        break; /* i < inBytes */
                    }
                    ch = (ch << 6) + ch2;
                    ++mySource;
                    i++;
                }
                else
                {
                    /* out of input in mid-sequence: save the partial state for the next call */
                    cnv->toUnicodeStatus = ch;
                    cnv->mode = inBytes;
                    cnv->toULength = (int8_t)i;
                    goto donefornow;
                }
            }

            // In CESU-8, only surrogates, not supplementary code points, are encoded directly.
            // A complete sequence of more than 3 bytes is therefore treated as illegal there.
            if (i == inBytes && (!isCESU8 || i <= 3))
            {
                /* Remove the accumulated high bits */
                ch -= offsetsFromUTF8[inBytes];

                /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
                if (ch <= MAXIMUM_UCS2)
                {
                    /* fits in 16 bits */
                    *(myTarget++) = (UChar) ch;
                    *(myOffsets++) = offsetNum;
                }
                else
                {
                    /* write out the surrogates; both get the offset of the sequence start */
                    *(myTarget++) = U16_LEAD(ch);
                    *(myOffsets++) = offsetNum;
                    ch = U16_TRAIL(ch);
                    if (myTarget < targetLimit)
                    {
                        *(myTarget++) = (UChar)ch;
                        *(myOffsets++) = offsetNum;
                    }
                    else
                    {
                        /*
                         * No room for the trail surrogate: park it in the overflow
                         * buffer. There is no break here; the loop condition ends
                         * the loop because the target is full.
                         */
                        cnv->UCharErrorBuffer[0] = (UChar) ch;
                        cnv->UCharErrorBufferLength = 1;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
                /* the next sequence starts i bytes further into the source */
                offsetNum += i;
            }
            else
            {
                /* Malformed sequence: report how many bytes of it were consumed */
                cnv->toULength = (int8_t)i;
                *err = U_ILLEGAL_CHAR_FOUND;
                break;
            }
        }
    }

donefornow:
    /* Input is left over, the target is full and no other error was set */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {   /* End of target buffer */
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report the positions reached back to the caller */
    args->target = myTarget;
    args->source = (const char *) mySource;
    args->offsets = myOffsets;
}
    298 U_CDECL_END
    299 
/**
 * Converts UTF-16 code units to UTF-8 (or CESU-8) bytes, without writing offsets.
 *
 * Reads from args->source up to args->sourceLimit and writes to args->target
 * up to args->targetLimit. Before returning, args->source and args->target
 * are set to the positions that were reached.
 *
 * For UTF-8, surrogate pairs are combined into one supplementary code point
 * and written as 4 bytes. For CESU-8 the surrogate handling is skipped, so
 * every surrogate code unit is written as its own 3-byte sequence.
 *
 * A surrogate that is the last code unit of the input is stored in
 * cnv->fromUChar32 without an error and is processed again at the start of a
 * later call that has target space available.
 *
 * @param args  Conversion arguments (converter, source and target pointers
 *              with their limits).
 * @param err   Set to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (stored
 *              in cnv->fromUChar32), or to U_BUFFER_OVERFLOW_ERROR when the
 *              target is full; bytes of the current character that did not
 *              fit are placed in cnv->charErrorBuffer.
 */
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
                                    UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *mySource = args->source;
    const UChar *sourceLimit = args->sourceLimit;
    uint8_t *myTarget = (uint8_t *) args->target;
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    uint8_t *tempPtr;
    UChar32 ch;
    uint8_t tempBuf[4];     /* staging area used when fewer than 4 target bytes remain */
    int32_t indexToWrite;   /* index of the last byte of the 3- or 4-byte sequence */
    UBool isNotCESU8 = !hasCESU8Data(cnv);

    /* Continue with a surrogate that was left pending by a previous call */
    if (cnv->fromUChar32 && myTarget < targetLimit)
    {
        ch = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        /* jump into the surrogate handling inside the main loop below */
        goto lowsurrogate;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (ch < 0x80)        /* Single byte */
        {
            *(myTarget++) = (uint8_t) ch;
        }
        else if (ch < 0x800)  /* Double byte */
        {
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
            if (myTarget < targetLimit)
            {
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
            }
            else
            {
                /* the second byte does not fit: park it in the overflow buffer */
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        else {
            /* Check for surrogates (not for CESU-8, which encodes them individually) */
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
lowsurrogate:
                if (mySource < sourceLimit) {
                    /* test both code units */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
                        /* convert and consume this supplementary code point */
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
                        ++mySource;
                        /* exit this condition tree */
                    }
                    else {
                        /* this is an unpaired trail or lead code unit */
                        /* callback(illegal) */
                        cnv->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* no more input: keep the surrogate for the next call */
                    cnv->fromUChar32 = ch;
                    break;
                }
            }

            /* Do we write the buffer directly for speed,
            or do we have to be careful about target buffer space? */
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

            if (ch <= MAXIMUM_UCS2) {
                /* 3-byte sequence: lead byte here, two trail bytes below */
                indexToWrite = 2;
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
            }
            else {
                /* 4-byte sequence: lead byte and first trail byte here, two more below */
                indexToWrite = 3;
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
            }
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

            if (tempPtr == myTarget) {
                /* There was enough space to write the codepoint directly. */
                myTarget += (indexToWrite + 1);
            }
            else {
                /* We might run out of room soon. Write it slowly. */
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
                    if (myTarget < targetLimit) {
                        *(myTarget++) = *tempPtr;
                    }
                    else {
                        /*
                         * Append the byte to the overflow buffer.
                         * NOTE(review): relies on charErrorBufferLength being 0
                         * on entry - presumably guaranteed by the caller; confirm.
                         */
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
            }
        }
    }

    /* Input is left over, the target is full and no other error was set */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report the positions reached back to the caller */
    args->target = (char *) myTarget;
    args->source = mySource;
}
    413 
/**
 * Converts UTF-16 code units to UTF-8 (or CESU-8) bytes and records, for every
 * byte written to the target, the source index of the code unit that started
 * the character in args->offsets.
 *
 * Same conversion logic as ucnv_fromUnicode_UTF8(): for UTF-8, surrogate pairs
 * are combined and written as 4 bytes; for CESU-8 the surrogate handling is
 * skipped. A surrogate that is the last code unit of the input is stored in
 * cnv->fromUChar32 and processed again by a later call; the bytes of such a
 * resumed character get the offset -1.
 * Before returning, args->source, args->target and args->offsets are set to
 * the positions that were reached.
 *
 * @param args  Conversion arguments (converter, source/target pointers with
 *              their limits, and the offsets array).
 * @param err   Set to U_ILLEGAL_CHAR_FOUND for an unpaired surrogate (stored
 *              in cnv->fromUChar32), or to U_BUFFER_OVERFLOW_ERROR when the
 *              target is full; bytes of the current character that did not
 *              fit are placed in cnv->charErrorBuffer.
 */
U_CFUNC void  U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
                                                  UErrorCode * err)
{
    UConverter *cnv = args->converter;
    const UChar *mySource = args->source;
    int32_t *myOffsets = args->offsets;
    const UChar *sourceLimit = args->sourceLimit;
    uint8_t *myTarget = (uint8_t *) args->target;
    const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    uint8_t *tempPtr;
    UChar32 ch;
    /* offsetNum: source index of the current character; nextSourceIndex: index of the one after it */
    int32_t offsetNum, nextSourceIndex;
    int32_t indexToWrite;   /* index of the last byte of the 3- or 4-byte sequence */
    uint8_t tempBuf[4];     /* staging area used when fewer than 4 target bytes remain */
    UBool isNotCESU8 = !hasCESU8Data(cnv);

    /* Continue with a surrogate that was left pending by a previous call */
    if (cnv->fromUChar32 && myTarget < targetLimit)
    {
        ch = cnv->fromUChar32;
        cnv->fromUChar32 = 0;
        /* the pending unit came from before this buffer: offset -1, next unit is at index 0 */
        offsetNum = -1;
        nextSourceIndex = 0;
        /* jump into the surrogate handling inside the main loop below */
        goto lowsurrogate;
    } else {
        offsetNum = 0;
    }

    while (mySource < sourceLimit && myTarget < targetLimit)
    {
        ch = *(mySource++);

        if (ch < 0x80)        /* Single byte */
        {
            *(myOffsets++) = offsetNum++;
            *(myTarget++) = (char) ch;
        }
        else if (ch < 0x800)  /* Double byte */
        {
            *(myOffsets++) = offsetNum;
            *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
            if (myTarget < targetLimit)
            {
                *(myOffsets++) = offsetNum++;
                *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
            }
            else
            {
                /* the second byte does not fit: park it in the overflow buffer */
                cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
                cnv->charErrorBufferLength = 1;
                *err = U_BUFFER_OVERFLOW_ERROR;
            }
        }
        else
        /* Check for surrogates */
        {
            nextSourceIndex = offsetNum + 1;

            /* not for CESU-8, which encodes surrogates individually */
            if(U16_IS_SURROGATE(ch) && isNotCESU8) {
lowsurrogate:
                if (mySource < sourceLimit) {
                    /* test both code units */
                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) {
                        /* convert and consume this supplementary code point */
                        ch=U16_GET_SUPPLEMENTARY(ch, *mySource);
                        ++mySource;
                        ++nextSourceIndex;
                        /* exit this condition tree */
                    }
                    else {
                        /* this is an unpaired trail or lead code unit */
                        /* callback(illegal) */
                        cnv->fromUChar32 = ch;
                        *err = U_ILLEGAL_CHAR_FOUND;
                        break;
                    }
                }
                else {
                    /* no more input: keep the surrogate for the next call */
                    cnv->fromUChar32 = ch;
                    break;
                }
            }

            /* Do we write the buffer directly for speed,
            or do we have to be careful about target buffer space? */
            tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);

            if (ch <= MAXIMUM_UCS2) {
                /* 3-byte sequence: lead byte here, two trail bytes below */
                indexToWrite = 2;
                tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
            }
            else {
                /* 4-byte sequence: lead byte and first trail byte here, two more below */
                indexToWrite = 3;
                tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
                tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
            }
            tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
            tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);

            if (tempPtr == myTarget) {
                /* There was enough space to write the codepoint directly. */
                myTarget += (indexToWrite + 1);
                /* one offset per byte written, all pointing at the start of the character */
                myOffsets[0] = offsetNum;
                myOffsets[1] = offsetNum;
                myOffsets[2] = offsetNum;
                if (indexToWrite >= 3) {
                    myOffsets[3] = offsetNum;
                }
                myOffsets += (indexToWrite + 1);
            }
            else {
                /* We might run out of room soon. Write it slowly. */
                for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
                    if (myTarget < targetLimit)
                    {
                        *(myOffsets++) = offsetNum;
                        *(myTarget++) = *tempPtr;
                    }
                    else
                    {
                        /*
                         * Append the byte to the overflow buffer (no offset is written for it).
                         * NOTE(review): relies on charErrorBufferLength being 0
                         * on entry - presumably guaranteed by the caller; confirm.
                         */
                        cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
                        *err = U_BUFFER_OVERFLOW_ERROR;
                    }
                }
            }
            /* advance past the one or two code units of this character */
            offsetNum = nextSourceIndex;
        }
    }

    /* Input is left over, the target is full and no other error was set */
    if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    {
        *err = U_BUFFER_OVERFLOW_ERROR;
    }

    /* Report the positions reached back to the caller */
    args->target = (char *) myTarget;
    args->source = mySource;
    args->offsets = myOffsets;
}
    552 
    553 U_CDECL_BEGIN
    554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
    555                                                UErrorCode *err) {
    556     UConverter *cnv;
    557     const uint8_t *sourceInitial;
    558     const uint8_t *source;
    559     uint8_t myByte;
    560     UChar32 ch;
    561     int8_t i;
    562 
    563     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
    564 
    565     cnv = args->converter;
    566     sourceInitial = source = (const uint8_t *)args->source;
    567     if (source >= (const uint8_t *)args->sourceLimit)
    568     {
    569         /* no input */
    570         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    571         return 0xffff;
    572     }
    573 
    574     myByte = (uint8_t)*(source++);
    575     if (U8_IS_SINGLE(myByte))
    576     {
    577         args->source = (const char *)source;
    578         return (UChar32)myByte;
    579     }
    580 
    581     uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte);
    582     if (countTrailBytes == 0) {
    583         cnv->toUBytes[0] = myByte;
    584         cnv->toULength = 1;
    585         *err = U_ILLEGAL_CHAR_FOUND;
    586         args->source = (const char *)source;
    587         return 0xffff;
    588     }
    589 
    590     /*The byte sequence is longer than the buffer area passed*/
    591     if (((const char *)source + countTrailBytes) > args->sourceLimit)
    592     {
    593         /* check if all of the remaining bytes are trail bytes */
    594         uint16_t extraBytesToWrite = countTrailBytes + 1;
    595         cnv->toUBytes[0] = myByte;
    596         i = 1;
    597         *err = U_TRUNCATED_CHAR_FOUND;
    598         while(source < (const uint8_t *)args->sourceLimit) {
    599             uint8_t b = *source;
    600             if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) {
    601                 cnv->toUBytes[i++] = b;
    602                 ++source;
    603             } else {
    604                 /* error even before we run out of input */
    605                 *err = U_ILLEGAL_CHAR_FOUND;
    606                 break;
    607             }
    608         }
    609         cnv->toULength = i;
    610         args->source = (const char *)source;
    611         return 0xffff;
    612     }
    613 
    614     ch = myByte << 6;
    615     if(countTrailBytes == 2) {
    616         uint8_t t1 = *source, t2;
    617         if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) {
    618             args->source = (const char *)(source + 1);
    619             return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3];
    620         }
    621     } else if(countTrailBytes == 1) {
    622         uint8_t t1 = *source;
    623         if(U8_IS_TRAIL(t1)) {
    624             args->source = (const char *)(source + 1);
    625             return (ch + t1) - offsetsFromUTF8[2];
    626         }
    627     } else {  // countTrailBytes == 3
    628         uint8_t t1 = *source, t2, t3;
    629         if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) &&
    630                 U8_IS_TRAIL(t3 = *++source)) {
    631             args->source = (const char *)(source + 1);
    632             return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4];
    633         }
    634     }
    635     args->source = (const char *)source;
    636 
    637     for(i = 0; sourceInitial < source; ++i) {
    638         cnv->toUBytes[i] = *sourceInitial++;
    639     }
    640     cnv->toULength = i;
    641     *err = U_ILLEGAL_CHAR_FOUND;
    642     return 0xffff;
    643 }
    644 U_CDECL_END
    645 
    646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
    647 
    648 U_CDECL_BEGIN
    649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
    650 static void U_CALLCONV
    651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    652                   UConverterToUnicodeArgs *pToUArgs,
    653                   UErrorCode *pErrorCode) {
    654     UConverter *utf8;
    655     const uint8_t *source, *sourceLimit;
    656     uint8_t *target;
    657     int32_t targetCapacity;
    658     int32_t count;
    659 
    660     int8_t oldToULength, toULength, toULimit;
    661 
    662     UChar32 c;
    663     uint8_t b, t1, t2;
    664 
    665     /* set up the local pointers */
    666     utf8=pToUArgs->converter;
    667     source=(uint8_t *)pToUArgs->source;
    668     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    669     target=(uint8_t *)pFromUArgs->target;
    670     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    671 
    672     /* get the converter state from the UTF-8 UConverter */
    673     c=(UChar32)utf8->toUnicodeStatus;
    674     if(c!=0) {
    675         toULength=oldToULength=utf8->toULength;
    676         toULimit=(int8_t)utf8->mode;
    677     } else {
    678         toULength=oldToULength=toULimit=0;
    679     }
    680 
    681     count=(int32_t)(sourceLimit-source)+oldToULength;
    682     if(count<toULimit) {
    683         /*
    684          * Not enough input to complete the partial character.
    685          * Jump to moreBytes below - it will not output to target.
    686          */
    687     } else if(targetCapacity<toULimit) {
    688         /*
    689          * Not enough target capacity to output the partial character.
    690          * Let the standard converter handle this.
    691          */
    692         *pErrorCode=U_USING_DEFAULT_WARNING;
    693         return;
    694     } else {
    695         // Use a single counter for source and target, counting the minimum of
    696         // the source length and the target capacity.
    697         // Let the standard converter handle edge cases.
    698         const uint8_t *limit=sourceLimit;
    699         if(count>targetCapacity) {
    700             limit-=(count-targetCapacity);
    701             count=targetCapacity;
    702         }
    703 
    704         // The conversion loop checks count>0 only once per 1/2/3-byte character.
    705         // If the buffer ends with a truncated 2- or 3-byte sequence,
    706         // then we reduce the count to stop before that,
    707         // and collect the remaining bytes after the conversion loop.
    708         {
    709             // Do not go back into the bytes that will be read for finishing a partial
    710             // sequence from the previous buffer.
    711             int32_t length=count-toULimit;
    712             if(length>0) {
    713                 uint8_t b1=*(limit-1);
    714                 if(U8_IS_SINGLE(b1)) {
    715                     // common ASCII character
    716                 } else if(U8_IS_TRAIL(b1) && length>=2) {
    717                     uint8_t b2=*(limit-2);
    718                     if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) {
    719                         // truncated 3-byte sequence
    720                         count-=2;
    721                     }
    722                 } else if(0xc2<=b1 && b1<0xf0) {
    723                     // truncated 2- or 3-byte sequence
    724                     --count;
    725                 }
    726             }
    727         }
    728     }
    729 
    730     if(c!=0) {
    731         utf8->toUnicodeStatus=0;
    732         utf8->toULength=0;
    733         goto moreBytes;
    734         /* See note in ucnv_SBCSFromUTF8() about this goto. */
    735     }
    736 
    737     /* conversion loop */
    738     while(count>0) {
    739         b=*source++;
    740         if(U8_IS_SINGLE(b)) {
    741             /* convert ASCII */
    742             *target++=b;
    743             --count;
    744             continue;
    745         } else {
    746             if(b>=0xe0) {
    747                 if( /* handle U+0800..U+FFFF inline */
    748                     b<0xf0 &&
    749                     U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) &&
    750                     U8_IS_TRAIL(t2=source[1])
    751                 ) {
    752                     source+=2;
    753                     *target++=b;
    754                     *target++=t1;
    755                     *target++=t2;
    756                     count-=3;
    757                     continue;
    758                 }
    759             } else {
    760                 if( /* handle U+0080..U+07FF inline */
    761                     b>=0xc2 &&
    762                     U8_IS_TRAIL(t1=*source)
    763                 ) {
    764                     ++source;
    765                     *target++=b;
    766                     *target++=t1;
    767                     count-=2;
    768                     continue;
    769                 }
    770             }
    771 
    772             /* handle "complicated" and error cases, and continuing partial characters */
    773             oldToULength=0;
    774             toULength=1;
    775             toULimit=U8_COUNT_BYTES_NON_ASCII(b);
    776             c=b;
    777 moreBytes:
    778             while(toULength<toULimit) {
    779                 if(source<sourceLimit) {
    780                     b=*source;
    781                     if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) {
    782                         ++source;
    783                         ++toULength;
    784                         c=(c<<6)+b;
    785                     } else {
    786                         break; /* sequence too short, stop with toULength<toULimit */
    787                     }
    788                 } else {
    789                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
    790                     source-=(toULength-oldToULength);
    791                     while(oldToULength<toULength) {
    792                         utf8->toUBytes[oldToULength++]=*source++;
    793                     }
    794                     utf8->toUnicodeStatus=c;
    795                     utf8->toULength=toULength;
    796                     utf8->mode=toULimit;
    797                     pToUArgs->source=(char *)source;
    798                     pFromUArgs->target=(char *)target;
    799                     return;
    800                 }
    801             }
    802 
    803             if(toULength!=toULimit) {
    804                 /* error handling: illegal UTF-8 byte sequence */
    805                 source-=(toULength-oldToULength);
    806                 while(oldToULength<toULength) {
    807                     utf8->toUBytes[oldToULength++]=*source++;
    808                 }
    809                 utf8->toULength=toULength;
    810                 pToUArgs->source=(char *)source;
    811                 pFromUArgs->target=(char *)target;
    812                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    813                 return;
    814             }
    815 
    816             /* copy the legal byte sequence to the target */
    817             if(count>=toULength) {
    818                 int8_t i;
    819 
    820                 for(i=0; i<oldToULength; ++i) {
    821                     *target++=utf8->toUBytes[i];
    822                 }
    823                 source-=(toULength-oldToULength);
    824                 for(; i<toULength; ++i) {
    825                     *target++=*source++;
    826                 }
    827                 count-=toULength;
    828             } else {
    829                 // A supplementary character that does not fit into the target.
    830                 // Let the standard converter handle this.
    831                 source-=(toULength-oldToULength);
    832                 pToUArgs->source=(char *)source;
    833                 pFromUArgs->target=(char *)target;
    834                 *pErrorCode=U_USING_DEFAULT_WARNING;
    835                 return;
    836             }
    837         }
    838     }
    839     U_ASSERT(count>=0);
    840 
    841     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
    842         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
    843             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    844         } else {
    845             b=*source;
    846             toULimit=U8_COUNT_BYTES(b);
    847             if(toULimit>(sourceLimit-source)) {
    848                 /* collect a truncated byte sequence */
    849                 toULength=0;
    850                 c=b;
    851                 for(;;) {
    852                     utf8->toUBytes[toULength++]=b;
    853                     if(++source==sourceLimit) {
    854                         /* partial byte sequence at end of source */
    855                         utf8->toUnicodeStatus=c;
    856                         utf8->toULength=toULength;
    857                         utf8->mode=toULimit;
    858                         break;
    859                     } else if(!U8_IS_TRAIL(b=*source)) {
    860                         /* lead byte in trail byte position */
    861                         utf8->toULength=toULength;
    862                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    863                         break;
    864                     }
    865                     c=(c<<6)+b;
    866                 }
    867             } else {
    868                 /* partial-sequence target overflow: fall back to the pivoting implementation */
    869                 *pErrorCode=U_USING_DEFAULT_WARNING;
    870             }
    871         }
    872     }
    873 
    874     /* write back the updated pointers */
    875     pToUArgs->source=(char *)source;
    876     pFromUArgs->target=(char *)target;
    877 }
    878 
    879 U_CDECL_END
    880 
    881 /* UTF-8 converter data ----------------------------------------------------- */
    882 
    883 static const UConverterImpl _UTF8Impl={ /* Function table for the UTF-8 converter. Entries are positional; the slot names in the comments below are assumed from the UConverterImpl declaration (ucnv_cnv.h, not visible here) -- TODO confirm. */
    884     UCNV_UTF8, /* converter type */
    885 
    886     NULL, /* presumably load: none */
    887     NULL, /* presumably unload: none */
    888 
    889     NULL, /* presumably open: none */
    890     NULL, /* presumably close: none */
    891     NULL, /* presumably reset: none */
    892 
    893     ucnv_toUnicode_UTF8, /* to-Unicode conversion */
    894     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* to-Unicode conversion, offsets variant */
    895     ucnv_fromUnicode_UTF8, /* from-Unicode conversion */
    896     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* from-Unicode conversion, offsets variant */
    897     ucnv_getNextUChar_UTF8, /* presumably getNextUChar */
    898 
    899     NULL, /* presumably getStarters: none */
    900     NULL, /* presumably getName: none */
    901     NULL, /* presumably writeSub: none */
    902     NULL, /* presumably safeClone: none */
    903     ucnv_getNonSurrogateUnicodeSet, /* presumably getUnicodeSet */
    904 
    905     ucnv_UTF8FromUTF8, /* presumably toUTF8 */
    906     ucnv_UTF8FromUTF8 /* presumably fromUTF8; same function as the previous slot */
    907 };
    908 
    909 /* CCSID 1208 refers to UTF-8 for any version of Unicode. */
    910 static const UConverterStaticData _UTF8StaticData={ /* Static descriptor for the UTF-8 converter. Entries are positional; field names in the comments below are assumed from UConverterStaticData (ucnv_bld.h, not visible here) -- TODO confirm. */
    911     sizeof(UConverterStaticData), /* size of this structure */
    912     "UTF-8", /* converter name */
    913     1208, UCNV_IBM, UCNV_UTF8, /* CCSID 1208; presumably platform and conversion type */
    914     1, 3, /* presumably min and max bytes per UChar: max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
    915     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* presumably substitution bytes (EF BF BD is the UTF-8 form of U+FFFD) and their length 3; the two FALSE values are presumably the to-/from-Unicode fallback flags */
    916     0, /* presumably unicodeMask */
    917     0, /* presumably subChar1 */
    918     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    919 };
    920 
    921 
    922 const UConverterSharedData _UTF8Data= /* shared-data object for the UTF-8 converter, initialized from _UTF8StaticData and _UTF8Impl */
    923         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl);
    924 
    925 /* CESU-8 converter data ---------------------------------------------------- */
    926 
    927 static const UConverterImpl _CESU8Impl={ /* Function table for the CESU-8 converter; it uses the same four conversion functions as _UTF8Impl. Entries are positional; the slot names in the comments below are assumed from the UConverterImpl declaration (ucnv_cnv.h, not visible here) -- TODO confirm. */
    928     UCNV_CESU8, /* converter type */
    929 
    930     NULL, /* presumably load: none */
    931     NULL, /* presumably unload: none */
    932 
    933     NULL, /* presumably open: none */
    934     NULL, /* presumably close: none */
    935     NULL, /* presumably reset: none */
    936 
    937     ucnv_toUnicode_UTF8, /* to-Unicode conversion */
    938     ucnv_toUnicode_UTF8_OFFSETS_LOGIC, /* to-Unicode conversion, offsets variant */
    939     ucnv_fromUnicode_UTF8, /* from-Unicode conversion */
    940     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, /* from-Unicode conversion, offsets variant */
    941     NULL, /* presumably getNextUChar: none here, unlike _UTF8Impl */
    942 
    943     NULL, /* presumably getStarters: none */
    944     NULL, /* presumably getName: none */
    945     NULL, /* presumably writeSub: none */
    946     NULL, /* presumably safeClone: none */
    947     ucnv_getCompleteUnicodeSet, /* presumably getUnicodeSet */
    948 
    949     NULL, /* presumably toUTF8: none here, unlike _UTF8Impl */
    950     NULL /* presumably fromUTF8: none here, unlike _UTF8Impl */
    951 };
    952 
    953 static const UConverterStaticData _CESU8StaticData={ /* Static descriptor for the CESU-8 converter. Entries are positional; field names in the comments below are assumed from UConverterStaticData (ucnv_bld.h, not visible here) -- TODO confirm. */
    954     sizeof(UConverterStaticData), /* size of this structure */
    955     "CESU-8", /* converter name */
    956     9400, /* CCSID for CESU-8 */
    957     UCNV_UNKNOWN, UCNV_CESU8, 1, 3, /* presumably platform, conversion type, then min and max bytes per UChar */
    958     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, /* presumably substitution bytes (EF BF BD is the UTF-8 form of U+FFFD) and their length 3; the two FALSE values are presumably the to-/from-Unicode fallback flags */
    959     0, /* presumably unicodeMask */
    960     0, /* presumably subChar1 */
    961     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    962 };
    963 
    964 
    965 const UConverterSharedData _CESU8Data= /* shared-data object for the CESU-8 converter, initialized from _CESU8StaticData and _CESU8Impl */
    966         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl);
    967 
    968 #endif
    969