Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2007, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv_u8.c
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2002jul01
     12 *   created by: Markus W. Scherer
     13 *
     14 *   UTF-8 converter implementation. Used to be in ucnv_utf.c.
     15 *
     16 *   Also, CESU-8 implementation, see UTR 26.
     17 *   The CESU-8 converter uses all the same functions as the
     18 *   UTF-8 converter, with a branch for converting supplementary code points.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_CONVERSION
     24 
     25 #include "unicode/ucnv.h"
     26 #include "ucnv_bld.h"
     27 #include "ucnv_cnv.h"
     28 #include "cmemory.h"
     29 
     30 /* Prototypes --------------------------------------------------------------- */
     31 
     32 /* Keep these here to make finicky compilers happy */
     33 
/*
 * Forward declarations of the two UTF-16 -> UTF-8 entry points that are
 * defined further down in this file with U_CFUNC linkage.
 * Both read their input/output pointers from *args and report status
 * through *err.
 */
U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args,
                                           UErrorCode *err);
/* Same conversion, additionally filling args->offsets. */
U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args,
                                                        UErrorCode *err);
     38 
     39 
     40 /* UTF-8 -------------------------------------------------------------------- */
     41 
     42 /* UTF-8 Conversion DATA
     43  *   for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9
     44  */
     45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/
     46 #define MAXIMUM_UCS2            0x0000FFFF
     47 #define MAXIMUM_UTF             0x0010FFFF
     48 #define MAXIMUM_UCS4            0x7FFFFFFF
     49 #define HALF_SHIFT              10
     50 #define HALF_BASE               0x0010000
     51 #define HALF_MASK               0x3FF
     52 #define SURROGATE_HIGH_START    0xD800
     53 #define SURROGATE_HIGH_END      0xDBFF
     54 #define SURROGATE_LOW_START     0xDC00
     55 #define SURROGATE_LOW_END       0xDFFF
     56 
     57 /* -SURROGATE_LOW_START + HALF_BASE */
     58 #define SURROGATE_LOW_BASE      9216
     59 
     60 static const uint32_t offsetsFromUTF8[7] = {0,
     61   (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080,
     62   (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080
     63 };
     64 
     65 /* END OF UTF-8 Conversion DATA */
     66 
     67 static const int8_t bytesFromUTF8[256] = {
     68   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     69   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     70   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     71   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     72   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     73   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     74   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
     75   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
     76 };
     77 
     78 /*
     79  * Starting with Unicode 3.0.1:
     80  * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N];
     81  * byte sequences with more than 4 bytes are illegal in UTF-8,
     82  * which is tested with impossible values for them
     83  */
     84 static const uint32_t
     85 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff };
     86 
     87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args,
     88                                   UErrorCode * err)
     89 {
     90     UConverter *cnv = args->converter;
     91     const unsigned char *mySource = (unsigned char *) args->source;
     92     UChar *myTarget = args->target;
     93     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     94     const UChar *targetLimit = args->targetLimit;
     95     unsigned char *toUBytes = cnv->toUBytes;
     96     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
     97     uint32_t ch, ch2 = 0;
     98     int32_t i, inBytes;
     99 
    100     /* Restore size of current sequence */
    101     if (cnv->toUnicodeStatus && myTarget < targetLimit)
    102     {
    103         inBytes = cnv->mode;            /* restore # of bytes to consume */
    104         i = cnv->toULength;             /* restore # of bytes consumed */
    105         cnv->toULength = 0;
    106 
    107         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
    108         cnv->toUnicodeStatus = 0;
    109         goto morebytes;
    110     }
    111 
    112 
    113     while (mySource < sourceLimit && myTarget < targetLimit)
    114     {
    115         ch = *(mySource++);
    116         if (ch < 0x80)        /* Simple case */
    117         {
    118             *(myTarget++) = (UChar) ch;
    119         }
    120         else
    121         {
    122             /* store the first char */
    123             toUBytes[0] = (char)ch;
    124             inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */
    125             i = 1;
    126 
    127 morebytes:
    128             while (i < inBytes)
    129             {
    130                 if (mySource < sourceLimit)
    131                 {
    132                     toUBytes[i] = (char) (ch2 = *mySource);
    133                     if (!UTF8_IS_TRAIL(ch2))
    134                     {
    135                         break; /* i < inBytes */
    136                     }
    137                     ch = (ch << 6) + ch2;
    138                     ++mySource;
    139                     i++;
    140                 }
    141                 else
    142                 {
    143                     /* stores a partially calculated target*/
    144                     cnv->toUnicodeStatus = ch;
    145                     cnv->mode = inBytes;
    146                     cnv->toULength = (int8_t) i;
    147                     goto donefornow;
    148                 }
    149             }
    150 
    151             /* Remove the accumulated high bits */
    152             ch -= offsetsFromUTF8[inBytes];
    153 
    154             /*
    155              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    156              * - use only trail bytes after a lead byte (checked above)
    157              * - use the right number of trail bytes for a given lead byte
    158              * - encode a code point <= U+10ffff
    159              * - use the fewest possible number of bytes for their code points
    160              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    161              *
    162              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    163              * There are no irregular sequences any more.
    164              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
    165              */
    166             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
    167                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
    168             {
    169                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    170                 if (ch <= MAXIMUM_UCS2)
    171                 {
    172                     /* fits in 16 bits */
    173                     *(myTarget++) = (UChar) ch;
    174                 }
    175                 else
    176                 {
    177                     /* write out the surrogates */
    178                     ch -= HALF_BASE;
    179                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
    180                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
    181                     if (myTarget < targetLimit)
    182                     {
    183                         *(myTarget++) = (UChar)ch;
    184                     }
    185                     else
    186                     {
    187                         /* Put in overflow buffer (not handled here) */
    188                         cnv->UCharErrorBuffer[0] = (UChar) ch;
    189                         cnv->UCharErrorBufferLength = 1;
    190                         *err = U_BUFFER_OVERFLOW_ERROR;
    191                         break;
    192                     }
    193                 }
    194             }
    195             else
    196             {
    197                 cnv->toULength = (int8_t)i;
    198                 *err = U_ILLEGAL_CHAR_FOUND;
    199                 break;
    200             }
    201         }
    202     }
    203 
    204 donefornow:
    205     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    206     {
    207         /* End of target buffer */
    208         *err = U_BUFFER_OVERFLOW_ERROR;
    209     }
    210 
    211     args->target = myTarget;
    212     args->source = (const char *) mySource;
    213 }
    214 
    215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args,
    216                                                 UErrorCode * err)
    217 {
    218     UConverter *cnv = args->converter;
    219     const unsigned char *mySource = (unsigned char *) args->source;
    220     UChar *myTarget = args->target;
    221     int32_t *myOffsets = args->offsets;
    222     int32_t offsetNum = 0;
    223     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    224     const UChar *targetLimit = args->targetLimit;
    225     unsigned char *toUBytes = cnv->toUBytes;
    226     UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data);
    227     uint32_t ch, ch2 = 0;
    228     int32_t i, inBytes;
    229 
    230     /* Restore size of current sequence */
    231     if (cnv->toUnicodeStatus && myTarget < targetLimit)
    232     {
    233         inBytes = cnv->mode;            /* restore # of bytes to consume */
    234         i = cnv->toULength;             /* restore # of bytes consumed */
    235         cnv->toULength = 0;
    236 
    237         ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/
    238         cnv->toUnicodeStatus = 0;
    239         goto morebytes;
    240     }
    241 
    242     while (mySource < sourceLimit && myTarget < targetLimit)
    243     {
    244         ch = *(mySource++);
    245         if (ch < 0x80)        /* Simple case */
    246         {
    247             *(myTarget++) = (UChar) ch;
    248             *(myOffsets++) = offsetNum++;
    249         }
    250         else
    251         {
    252             toUBytes[0] = (char)ch;
    253             inBytes = bytesFromUTF8[ch];
    254             i = 1;
    255 
    256 morebytes:
    257             while (i < inBytes)
    258             {
    259                 if (mySource < sourceLimit)
    260                 {
    261                     toUBytes[i] = (char) (ch2 = *mySource);
    262                     if (!UTF8_IS_TRAIL(ch2))
    263                     {
    264                         break; /* i < inBytes */
    265                     }
    266                     ch = (ch << 6) + ch2;
    267                     ++mySource;
    268                     i++;
    269                 }
    270                 else
    271                 {
    272                     cnv->toUnicodeStatus = ch;
    273                     cnv->mode = inBytes;
    274                     cnv->toULength = (int8_t)i;
    275                     goto donefornow;
    276                 }
    277             }
    278 
    279             /* Remove the accumulated high bits */
    280             ch -= offsetsFromUTF8[inBytes];
    281 
    282             /*
    283              * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    284              * - use only trail bytes after a lead byte (checked above)
    285              * - use the right number of trail bytes for a given lead byte
    286              * - encode a code point <= U+10ffff
    287              * - use the fewest possible number of bytes for their code points
    288              * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    289              *
    290              * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    291              * There are no irregular sequences any more.
    292              * In CESU-8, only surrogates, not supplementary code points, are encoded directly.
    293              */
    294             if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] &&
    295                 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch)))
    296             {
    297                 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    298                 if (ch <= MAXIMUM_UCS2)
    299                 {
    300                     /* fits in 16 bits */
    301                     *(myTarget++) = (UChar) ch;
    302                     *(myOffsets++) = offsetNum;
    303                 }
    304                 else
    305                 {
    306                     /* write out the surrogates */
    307                     ch -= HALF_BASE;
    308                     *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START);
    309                     *(myOffsets++) = offsetNum;
    310                     ch = (ch & HALF_MASK) + SURROGATE_LOW_START;
    311                     if (myTarget < targetLimit)
    312                     {
    313                         *(myTarget++) = (UChar)ch;
    314                         *(myOffsets++) = offsetNum;
    315                     }
    316                     else
    317                     {
    318                         cnv->UCharErrorBuffer[0] = (UChar) ch;
    319                         cnv->UCharErrorBufferLength = 1;
    320                         *err = U_BUFFER_OVERFLOW_ERROR;
    321                     }
    322                 }
    323                 offsetNum += i;
    324             }
    325             else
    326             {
    327                 cnv->toULength = (int8_t)i;
    328                 *err = U_ILLEGAL_CHAR_FOUND;
    329                 break;
    330             }
    331         }
    332     }
    333 
    334 donefornow:
    335     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    336     {   /* End of target buffer */
    337         *err = U_BUFFER_OVERFLOW_ERROR;
    338     }
    339 
    340     args->target = myTarget;
    341     args->source = (const char *) mySource;
    342     args->offsets = myOffsets;
    343 }
    344 
    345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args,
    346                                     UErrorCode * err)
    347 {
    348     UConverter *cnv = args->converter;
    349     const UChar *mySource = args->source;
    350     const UChar *sourceLimit = args->sourceLimit;
    351     uint8_t *myTarget = (uint8_t *) args->target;
    352     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    353     uint8_t *tempPtr;
    354     UChar32 ch;
    355     uint8_t tempBuf[4];
    356     int32_t indexToWrite;
    357     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
    358 
    359     if (cnv->fromUChar32 && myTarget < targetLimit)
    360     {
    361         ch = cnv->fromUChar32;
    362         cnv->fromUChar32 = 0;
    363         goto lowsurrogate;
    364     }
    365 
    366     while (mySource < sourceLimit && myTarget < targetLimit)
    367     {
    368         ch = *(mySource++);
    369 
    370         if (ch < 0x80)        /* Single byte */
    371         {
    372             *(myTarget++) = (uint8_t) ch;
    373         }
    374         else if (ch < 0x800)  /* Double byte */
    375         {
    376             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    377             if (myTarget < targetLimit)
    378             {
    379                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    380             }
    381             else
    382             {
    383                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    384                 cnv->charErrorBufferLength = 1;
    385                 *err = U_BUFFER_OVERFLOW_ERROR;
    386             }
    387         }
    388         else {
    389             /* Check for surrogates */
    390             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
    391 lowsurrogate:
    392                 if (mySource < sourceLimit) {
    393                     /* test both code units */
    394                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
    395                         /* convert and consume this supplementary code point */
    396                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
    397                         ++mySource;
    398                         /* exit this condition tree */
    399                     }
    400                     else {
    401                         /* this is an unpaired trail or lead code unit */
    402                         /* callback(illegal) */
    403                         cnv->fromUChar32 = ch;
    404                         *err = U_ILLEGAL_CHAR_FOUND;
    405                         break;
    406                     }
    407                 }
    408                 else {
    409                     /* no more input */
    410                     cnv->fromUChar32 = ch;
    411                     break;
    412                 }
    413             }
    414 
    415             /* Do we write the buffer directly for speed,
    416             or do we have to be careful about target buffer space? */
    417             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    418 
    419             if (ch <= MAXIMUM_UCS2) {
    420                 indexToWrite = 2;
    421                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    422             }
    423             else {
    424                 indexToWrite = 3;
    425                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    426                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    427             }
    428             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    429             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    430 
    431             if (tempPtr == myTarget) {
    432                 /* There was enough space to write the codepoint directly. */
    433                 myTarget += (indexToWrite + 1);
    434             }
    435             else {
    436                 /* We might run out of room soon. Write it slowly. */
    437                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    438                     if (myTarget < targetLimit) {
    439                         *(myTarget++) = *tempPtr;
    440                     }
    441                     else {
    442                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    443                         *err = U_BUFFER_OVERFLOW_ERROR;
    444                     }
    445                 }
    446             }
    447         }
    448     }
    449 
    450     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    451     {
    452         *err = U_BUFFER_OVERFLOW_ERROR;
    453     }
    454 
    455     args->target = (char *) myTarget;
    456     args->source = mySource;
    457 }
    458 
    459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args,
    460                                                   UErrorCode * err)
    461 {
    462     UConverter *cnv = args->converter;
    463     const UChar *mySource = args->source;
    464     int32_t *myOffsets = args->offsets;
    465     const UChar *sourceLimit = args->sourceLimit;
    466     uint8_t *myTarget = (uint8_t *) args->target;
    467     const uint8_t *targetLimit = (uint8_t *) args->targetLimit;
    468     uint8_t *tempPtr;
    469     UChar32 ch;
    470     int32_t offsetNum, nextSourceIndex;
    471     int32_t indexToWrite;
    472     uint8_t tempBuf[4];
    473     UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data);
    474 
    475     if (cnv->fromUChar32 && myTarget < targetLimit)
    476     {
    477         ch = cnv->fromUChar32;
    478         cnv->fromUChar32 = 0;
    479         offsetNum = -1;
    480         nextSourceIndex = 0;
    481         goto lowsurrogate;
    482     } else {
    483         offsetNum = 0;
    484     }
    485 
    486     while (mySource < sourceLimit && myTarget < targetLimit)
    487     {
    488         ch = *(mySource++);
    489 
    490         if (ch < 0x80)        /* Single byte */
    491         {
    492             *(myOffsets++) = offsetNum++;
    493             *(myTarget++) = (char) ch;
    494         }
    495         else if (ch < 0x800)  /* Double byte */
    496         {
    497             *(myOffsets++) = offsetNum;
    498             *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0);
    499             if (myTarget < targetLimit)
    500             {
    501                 *(myOffsets++) = offsetNum++;
    502                 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80);
    503             }
    504             else
    505             {
    506                 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80);
    507                 cnv->charErrorBufferLength = 1;
    508                 *err = U_BUFFER_OVERFLOW_ERROR;
    509             }
    510         }
    511         else
    512         /* Check for surrogates */
    513         {
    514             nextSourceIndex = offsetNum + 1;
    515 
    516             if(UTF_IS_SURROGATE(ch) && isNotCESU8) {
    517 lowsurrogate:
    518                 if (mySource < sourceLimit) {
    519                     /* test both code units */
    520                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) {
    521                         /* convert and consume this supplementary code point */
    522                         ch=UTF16_GET_PAIR_VALUE(ch, *mySource);
    523                         ++mySource;
    524                         ++nextSourceIndex;
    525                         /* exit this condition tree */
    526                     }
    527                     else {
    528                         /* this is an unpaired trail or lead code unit */
    529                         /* callback(illegal) */
    530                         cnv->fromUChar32 = ch;
    531                         *err = U_ILLEGAL_CHAR_FOUND;
    532                         break;
    533                     }
    534                 }
    535                 else {
    536                     /* no more input */
    537                     cnv->fromUChar32 = ch;
    538                     break;
    539                 }
    540             }
    541 
    542             /* Do we write the buffer directly for speed,
    543             or do we have to be careful about target buffer space? */
    544             tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf);
    545 
    546             if (ch <= MAXIMUM_UCS2) {
    547                 indexToWrite = 2;
    548                 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0);
    549             }
    550             else {
    551                 indexToWrite = 3;
    552                 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0);
    553                 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80);
    554             }
    555             tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80);
    556             tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80);
    557 
    558             if (tempPtr == myTarget) {
    559                 /* There was enough space to write the codepoint directly. */
    560                 myTarget += (indexToWrite + 1);
    561                 myOffsets[0] = offsetNum;
    562                 myOffsets[1] = offsetNum;
    563                 myOffsets[2] = offsetNum;
    564                 if (indexToWrite >= 3) {
    565                     myOffsets[3] = offsetNum;
    566                 }
    567                 myOffsets += (indexToWrite + 1);
    568             }
    569             else {
    570                 /* We might run out of room soon. Write it slowly. */
    571                 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) {
    572                     if (myTarget < targetLimit)
    573                     {
    574                         *(myOffsets++) = offsetNum;
    575                         *(myTarget++) = *tempPtr;
    576                     }
    577                     else
    578                     {
    579                         cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr;
    580                         *err = U_BUFFER_OVERFLOW_ERROR;
    581                     }
    582                 }
    583             }
    584             offsetNum = nextSourceIndex;
    585         }
    586     }
    587 
    588     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    589     {
    590         *err = U_BUFFER_OVERFLOW_ERROR;
    591     }
    592 
    593     args->target = (char *) myTarget;
    594     args->source = mySource;
    595     args->offsets = myOffsets;
    596 }
    597 
    598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args,
    599                                                UErrorCode *err) {
    600     UConverter *cnv;
    601     const uint8_t *sourceInitial;
    602     const uint8_t *source;
    603     uint16_t extraBytesToWrite;
    604     uint8_t myByte;
    605     UChar32 ch;
    606     int8_t i, isLegalSequence;
    607 
    608     /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */
    609 
    610     cnv = args->converter;
    611     sourceInitial = source = (const uint8_t *)args->source;
    612     if (source >= (const uint8_t *)args->sourceLimit)
    613     {
    614         /* no input */
    615         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    616         return 0xffff;
    617     }
    618 
    619     myByte = (uint8_t)*(source++);
    620     if (myByte < 0x80)
    621     {
    622         args->source = (const char *)source;
    623         return (UChar32)myByte;
    624     }
    625 
    626     extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte];
    627     if (extraBytesToWrite == 0) {
    628         cnv->toUBytes[0] = myByte;
    629         cnv->toULength = 1;
    630         *err = U_ILLEGAL_CHAR_FOUND;
    631         args->source = (const char *)source;
    632         return 0xffff;
    633     }
    634 
    635     /*The byte sequence is longer than the buffer area passed*/
    636     if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit)
    637     {
    638         /* check if all of the remaining bytes are trail bytes */
    639         cnv->toUBytes[0] = myByte;
    640         i = 1;
    641         *err = U_TRUNCATED_CHAR_FOUND;
    642         while(source < (const uint8_t *)args->sourceLimit) {
    643             if(U8_IS_TRAIL(myByte = *source)) {
    644                 cnv->toUBytes[i++] = myByte;
    645                 ++source;
    646             } else {
    647                 /* error even before we run out of input */
    648                 *err = U_ILLEGAL_CHAR_FOUND;
    649                 break;
    650             }
    651         }
    652         cnv->toULength = i;
    653         args->source = (const char *)source;
    654         return 0xffff;
    655     }
    656 
    657     isLegalSequence = 1;
    658     ch = myByte << 6;
    659     switch(extraBytesToWrite)
    660     {
    661       /* note: code falls through cases! (sic)*/
    662     case 6:
    663         ch += (myByte = *source);
    664         ch <<= 6;
    665         if (!UTF8_IS_TRAIL(myByte))
    666         {
    667             isLegalSequence = 0;
    668             break;
    669         }
    670         ++source;
    671     case 5:
    672         ch += (myByte = *source);
    673         ch <<= 6;
    674         if (!UTF8_IS_TRAIL(myByte))
    675         {
    676             isLegalSequence = 0;
    677             break;
    678         }
    679         ++source;
    680     case 4:
    681         ch += (myByte = *source);
    682         ch <<= 6;
    683         if (!UTF8_IS_TRAIL(myByte))
    684         {
    685             isLegalSequence = 0;
    686             break;
    687         }
    688         ++source;
    689     case 3:
    690         ch += (myByte = *source);
    691         ch <<= 6;
    692         if (!UTF8_IS_TRAIL(myByte))
    693         {
    694             isLegalSequence = 0;
    695             break;
    696         }
    697         ++source;
    698     case 2:
    699         ch += (myByte = *source);
    700         if (!UTF8_IS_TRAIL(myByte))
    701         {
    702             isLegalSequence = 0;
    703             break;
    704         }
    705         ++source;
    706     };
    707     ch -= offsetsFromUTF8[extraBytesToWrite];
    708     args->source = (const char *)source;
    709 
    710     /*
    711      * Legal UTF-8 byte sequences in Unicode 3.0.1 and up:
    712      * - use only trail bytes after a lead byte (checked above)
    713      * - use the right number of trail bytes for a given lead byte
    714      * - encode a code point <= U+10ffff
    715      * - use the fewest possible number of bytes for their code points
    716      * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[])
    717      *
    718      * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8.
    719      * There are no irregular sequences any more.
    720      */
    721     if (isLegalSequence &&
    722         (uint32_t)ch <= MAXIMUM_UTF &&
    723         (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] &&
    724         !U_IS_SURROGATE(ch)
    725     ) {
    726         return ch; /* return the code point */
    727     }
    728 
    729     for(i = 0; sourceInitial < source; ++i) {
    730         cnv->toUBytes[i] = *sourceInitial++;
    731     }
    732     cnv->toULength = i;
    733     *err = U_ILLEGAL_CHAR_FOUND;
    734     return 0xffff;
    735 }
    736 
    737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */
    738 
    739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */
    740 static const UChar32
    741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 };
    742 
    743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */
    744 static const UChar32
    745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 };
    746 
    747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */
    748 static void
    749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs,
    750                   UConverterToUnicodeArgs *pToUArgs,
    751                   UErrorCode *pErrorCode) {
    752     UConverter *utf8, *cnv;
    753     const uint8_t *source, *sourceLimit;
    754     uint8_t *target;
    755     int32_t targetCapacity;
    756     int32_t count;
    757 
    758     int8_t oldToULength, toULength, toULimit;
    759 
    760     UChar32 c;
    761     uint8_t b, t1, t2;
    762 
    763     /* set up the local pointers */
    764     utf8=pToUArgs->converter;
    765     cnv=pFromUArgs->converter;
    766     source=(uint8_t *)pToUArgs->source;
    767     sourceLimit=(uint8_t *)pToUArgs->sourceLimit;
    768     target=(uint8_t *)pFromUArgs->target;
    769     targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target);
    770 
    771     /* get the converter state from the UTF-8 UConverter */
    772     c=(UChar32)utf8->toUnicodeStatus;
    773     if(c!=0) {
    774         toULength=oldToULength=utf8->toULength;
    775         toULimit=(int8_t)utf8->mode;
    776     } else {
    777         toULength=oldToULength=toULimit=0;
    778     }
    779 
    780     count=(int32_t)(sourceLimit-source)+oldToULength;
    781     if(count<toULimit) {
    782         /*
    783          * Not enough input to complete the partial character.
    784          * Jump to moreBytes below - it will not output to target.
    785          */
    786     } else if(targetCapacity<toULimit) {
    787         /*
    788          * Not enough target capacity to output the partial character.
    789          * Let the standard converter handle this.
    790          */
    791         *pErrorCode=U_USING_DEFAULT_WARNING;
    792         return;
    793     } else {
    794         /*
    795          * Use a single counter for source and target, counting the minimum of
    796          * the source length and the target capacity.
    797          * As a result, the source length is checked only once per multi-byte
    798          * character instead of twice.
    799          *
    800          * Make sure that the last byte sequence is complete, or else
    801          * stop just before it.
    802          * (The longest legal byte sequence has 3 trail bytes.)
    803          * Count oldToULength (number of source bytes from a previous buffer)
    804          * into the source length but reduce the source index by toULimit
    805          * while going back over trail bytes in order to not go back into
    806          * the bytes that will be read for finishing a partial
    807          * sequence from the previous buffer.
    808          * Let the standard converter handle edge cases.
    809          */
    810         int32_t i;
    811 
    812         if(count>targetCapacity) {
    813             count=targetCapacity;
    814         }
    815 
    816         i=0;
    817         while(i<3 && i<(count-toULimit)) {
    818             b=source[count-oldToULength-i-1];
    819             if(U8_IS_TRAIL(b)) {
    820                 ++i;
    821             } else {
    822                 if(i<utf8_countTrailBytes[b]) {
    823                     /* stop converting before the lead byte if there are not enough trail bytes for it */
    824                     count-=i+1;
    825                 }
    826                 break;
    827             }
    828         }
    829     }
    830 
    831     if(c!=0) {
    832         utf8->toUnicodeStatus=0;
    833         utf8->toULength=0;
    834         goto moreBytes;
    835         /* See note in ucnv_SBCSFromUTF8() about this goto. */
    836     }
    837 
    838     /* conversion loop */
    839     while(count>0) {
    840         b=*source++;
    841         if((int8_t)b>=0) {
    842             /* convert ASCII */
    843             *target++=b;
    844             --count;
    845             continue;
    846         } else {
    847             if(b>0xe0) {
    848                 if( /* handle U+1000..U+D7FF inline */
    849                     (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) ||
    850                                                (b==0xed && (t1 <= 0x9f))) &&
    851                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
    852                 ) {
    853                     source+=2;
    854                     *target++=b;
    855                     *target++=t1;
    856                     *target++=t2;
    857                     count-=3;
    858                     continue;
    859                 }
    860             } else if(b<0xe0) {
    861                 if( /* handle U+0080..U+07FF inline */
    862                     b>=0xc2 &&
    863                     (t1=*source) >= 0x80 && t1 <= 0xbf
    864                 ) {
    865                     ++source;
    866                     *target++=b;
    867                     *target++=t1;
    868                     count-=2;
    869                     continue;
    870                 }
    871             } else if(b==0xe0) {
    872                 if( /* handle U+0800..U+0FFF inline */
    873                     (t1=source[0]) >= 0xa0 && t1 <= 0xbf &&
    874                     (t2=source[1]) >= 0x80 && t2 <= 0xbf
    875                 ) {
    876                     source+=2;
    877                     *target++=b;
    878                     *target++=t1;
    879                     *target++=t2;
    880                     count-=3;
    881                     continue;
    882                 }
    883             }
    884 
    885             /* handle "complicated" and error cases, and continuing partial characters */
    886             oldToULength=0;
    887             toULength=1;
    888             toULimit=utf8_countTrailBytes[b]+1;
    889             c=b;
    890 moreBytes:
    891             while(toULength<toULimit) {
    892                 if(source<sourceLimit) {
    893                     b=*source;
    894                     if(U8_IS_TRAIL(b)) {
    895                         ++source;
    896                         ++toULength;
    897                         c=(c<<6)+b;
    898                     } else {
    899                         break; /* sequence too short, stop with toULength<toULimit */
    900                     }
    901                 } else {
    902                     /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */
    903                     source-=(toULength-oldToULength);
    904                     while(oldToULength<toULength) {
    905                         utf8->toUBytes[oldToULength++]=*source++;
    906                     }
    907                     utf8->toUnicodeStatus=c;
    908                     utf8->toULength=toULength;
    909                     utf8->mode=toULimit;
    910                     pToUArgs->source=(char *)source;
    911                     pFromUArgs->target=(char *)target;
    912                     return;
    913                 }
    914             }
    915 
    916             if( toULength==toULimit &&      /* consumed all trail bytes */
    917                 (toULength==3 || toULength==2) &&             /* BMP */
    918                 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] &&
    919                 (c<=0xd7ff || 0xe000<=c)    /* not a surrogate */
    920             ) {
    921                 /* legal byte sequence for BMP code point */
    922             } else if(
    923                 toULength==toULimit && toULength==4 &&
    924                 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff)
    925             ) {
    926                 /* legal byte sequence for supplementary code point */
    927             } else {
    928                 /* error handling: illegal UTF-8 byte sequence */
    929                 source-=(toULength-oldToULength);
    930                 while(oldToULength<toULength) {
    931                     utf8->toUBytes[oldToULength++]=*source++;
    932                 }
    933                 utf8->toULength=toULength;
    934                 pToUArgs->source=(char *)source;
    935                 pFromUArgs->target=(char *)target;
    936                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    937                 return;
    938             }
    939 
    940             /* copy the legal byte sequence to the target */
    941             {
    942                 int8_t i;
    943 
    944                 for(i=0; i<oldToULength; ++i) {
    945                     *target++=utf8->toUBytes[i];
    946                 }
    947                 source-=(toULength-oldToULength);
    948                 for(; i<toULength; ++i) {
    949                     *target++=*source++;
    950                 }
    951                 count-=toULength;
    952             }
    953         }
    954     }
    955 
    956     if(U_SUCCESS(*pErrorCode) && source<sourceLimit) {
    957         if(target==(const uint8_t *)pFromUArgs->targetLimit) {
    958             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    959         } else {
    960             b=*source;
    961             toULimit=utf8_countTrailBytes[b]+1;
    962             if(toULimit>(sourceLimit-source)) {
    963                 /* collect a truncated byte sequence */
    964                 toULength=0;
    965                 c=b;
    966                 for(;;) {
    967                     utf8->toUBytes[toULength++]=b;
    968                     if(++source==sourceLimit) {
    969                         /* partial byte sequence at end of source */
    970                         utf8->toUnicodeStatus=c;
    971                         utf8->toULength=toULength;
    972                         utf8->mode=toULimit;
    973                         break;
    974                     } else if(!U8_IS_TRAIL(b=*source)) {
    975                         /* lead byte in trail byte position */
    976                         utf8->toULength=toULength;
    977                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    978                         break;
    979                     }
    980                     c=(c<<6)+b;
    981                 }
    982             } else {
    983                 /* partial-sequence target overflow: fall back to the pivoting implementation */
    984                 *pErrorCode=U_USING_DEFAULT_WARNING;
    985             }
    986         }
    987     }
    988 
    989     /* write back the updated pointers */
    990     pToUArgs->source=(char *)source;
    991     pFromUArgs->target=(char *)target;
    992 }
    993 
    994 /* UTF-8 converter data ----------------------------------------------------- */
    995 
    996 static const UConverterImpl _UTF8Impl={
    997     UCNV_UTF8,
    998 
    999     NULL,
   1000     NULL,
   1001 
   1002     NULL,
   1003     NULL,
   1004     NULL,
   1005 
   1006     ucnv_toUnicode_UTF8,
   1007     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
   1008     ucnv_fromUnicode_UTF8,
   1009     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   1010     ucnv_getNextUChar_UTF8,
   1011 
   1012     NULL,
   1013     NULL,
   1014     NULL,
   1015     NULL,
   1016     ucnv_getNonSurrogateUnicodeSet,
   1017 
   1018     ucnv_UTF8FromUTF8,
   1019     ucnv_UTF8FromUTF8
   1020 };
   1021 
   1022 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */
   1023 static const UConverterStaticData _UTF8StaticData={
   1024     sizeof(UConverterStaticData),
   1025     "UTF-8",
   1026     1208, UCNV_IBM, UCNV_UTF8,
   1027     1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */
   1028     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
   1029     0,
   1030     0,
   1031     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1032 };
   1033 
   1034 
   1035 const UConverterSharedData _UTF8Data={
   1036     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1037     NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl,
   1038     0
   1039 };
   1040 
   1041 /* CESU-8 converter data ---------------------------------------------------- */
   1042 
   1043 static const UConverterImpl _CESU8Impl={
   1044     UCNV_CESU8,
   1045 
   1046     NULL,
   1047     NULL,
   1048 
   1049     NULL,
   1050     NULL,
   1051     NULL,
   1052 
   1053     ucnv_toUnicode_UTF8,
   1054     ucnv_toUnicode_UTF8_OFFSETS_LOGIC,
   1055     ucnv_fromUnicode_UTF8,
   1056     ucnv_fromUnicode_UTF8_OFFSETS_LOGIC,
   1057     NULL,
   1058 
   1059     NULL,
   1060     NULL,
   1061     NULL,
   1062     NULL,
   1063     ucnv_getCompleteUnicodeSet
   1064 };
   1065 
   1066 static const UConverterStaticData _CESU8StaticData={
   1067     sizeof(UConverterStaticData),
   1068     "CESU-8",
   1069     9400, /* CCSID for CESU-8 */
   1070     UCNV_UNKNOWN, UCNV_CESU8, 1, 3,
   1071     { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE,
   1072     0,
   1073     0,
   1074     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1075 };
   1076 
   1077 
   1078 const UConverterSharedData _CESU8Data={
   1079     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1080     NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl,
   1081     0
   1082 };
   1083 
   1084 #endif
   1085