Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *
      9 * File ustrtrns.cpp
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   9/10/2001    Ram    Creation.
     15 ******************************************************************************
     16 */
     17 
     18 /*******************************************************************************
     19  *
     20  * u_strTo* and u_strFrom* APIs
     21  * WCS functions moved to ustr_wcs.c for better modularization
     22  *
     23  *******************************************************************************
     24  */
     25 
     26 
     27 #include "unicode/putil.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/utf.h"
     30 #include "unicode/utf8.h"
     31 #include "unicode/utf16.h"
     32 #include "cstring.h"
     33 #include "cmemory.h"
     34 #include "ustr_imp.h"
     35 #include "uassert.h"
     36 
     37 U_CAPI UChar* U_EXPORT2
     38 u_strFromUTF32WithSub(UChar *dest,
     39                int32_t destCapacity,
     40                int32_t *pDestLength,
     41                const UChar32 *src,
     42                int32_t srcLength,
     43                UChar32 subchar, int32_t *pNumSubstitutions,
     44                UErrorCode *pErrorCode) {
     45     const UChar32 *srcLimit;
     46     UChar32 ch;
     47     UChar *destLimit;
     48     UChar *pDest;
     49     int32_t reqLength;
     50     int32_t numSubstitutions;
     51 
     52     /* args check */
     53     if(U_FAILURE(*pErrorCode)){
     54         return NULL;
     55     }
     56     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
     57         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
     58         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     59     ) {
     60         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     61         return NULL;
     62     }
     63 
     64     if(pNumSubstitutions != NULL) {
     65         *pNumSubstitutions = 0;
     66     }
     67 
     68     pDest = dest;
     69     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
     70     reqLength = 0;
     71     numSubstitutions = 0;
     72 
     73     if(srcLength < 0) {
     74         /* simple loop for conversion of a NUL-terminated BMP string */
     75         while((ch=*src) != 0 &&
     76               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     77             ++src;
     78             if(pDest < destLimit) {
     79                 *pDest++ = (UChar)ch;
     80             } else {
     81                 ++reqLength;
     82             }
     83         }
     84         srcLimit = src;
     85         if(ch != 0) {
     86             /* "complicated" case, find the end of the remaining string */
     87             while(*++srcLimit != 0) {}
     88         }
     89     } else {
     90       srcLimit = (src!=NULL)?(src + srcLength):NULL;
     91     }
     92 
     93     /* convert with length */
     94     while(src < srcLimit) {
     95         ch = *src++;
     96         do {
     97             /* usually "loops" once; twice only for writing subchar */
     98             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
     99                 if(pDest < destLimit) {
    100                     *pDest++ = (UChar)ch;
    101                 } else {
    102                     ++reqLength;
    103                 }
    104                 break;
    105             } else if(0x10000 <= ch && ch <= 0x10ffff) {
    106                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
    107                     *pDest++ = U16_LEAD(ch);
    108                     *pDest++ = U16_TRAIL(ch);
    109                 } else {
    110                     reqLength += 2;
    111                 }
    112                 break;
    113             } else if((ch = subchar) < 0) {
    114                 /* surrogate code point, or not a Unicode code point at all */
    115                 *pErrorCode = U_INVALID_CHAR_FOUND;
    116                 return NULL;
    117             } else {
    118                 ++numSubstitutions;
    119             }
    120         } while(TRUE);
    121     }
    122 
    123     reqLength += (int32_t)(pDest - dest);
    124     if(pDestLength) {
    125         *pDestLength = reqLength;
    126     }
    127     if(pNumSubstitutions != NULL) {
    128         *pNumSubstitutions = numSubstitutions;
    129     }
    130 
    131     /* Terminate the buffer */
    132     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    133 
    134     return dest;
    135 }
    136 
    137 U_CAPI UChar* U_EXPORT2
    138 u_strFromUTF32(UChar *dest,
    139                int32_t destCapacity,
    140                int32_t *pDestLength,
    141                const UChar32 *src,
    142                int32_t srcLength,
    143                UErrorCode *pErrorCode) {
    144     return u_strFromUTF32WithSub(
    145             dest, destCapacity, pDestLength,
    146             src, srcLength,
    147             U_SENTINEL, NULL,
    148             pErrorCode);
    149 }
    150 
    151 U_CAPI UChar32* U_EXPORT2
    152 u_strToUTF32WithSub(UChar32 *dest,
    153              int32_t destCapacity,
    154              int32_t *pDestLength,
    155              const UChar *src,
    156              int32_t srcLength,
    157              UChar32 subchar, int32_t *pNumSubstitutions,
    158              UErrorCode *pErrorCode) {
    159     const UChar *srcLimit;
    160     UChar32 ch;
    161     UChar ch2;
    162     UChar32 *destLimit;
    163     UChar32 *pDest;
    164     int32_t reqLength;
    165     int32_t numSubstitutions;
    166 
    167     /* args check */
    168     if(U_FAILURE(*pErrorCode)){
    169         return NULL;
    170     }
    171     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    172         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    173         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    174     ) {
    175         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    176         return NULL;
    177     }
    178 
    179     if(pNumSubstitutions != NULL) {
    180         *pNumSubstitutions = 0;
    181     }
    182 
    183     pDest = dest;
    184     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
    185     reqLength = 0;
    186     numSubstitutions = 0;
    187 
    188     if(srcLength < 0) {
    189         /* simple loop for conversion of a NUL-terminated BMP string */
    190         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    191             ++src;
    192             if(pDest < destLimit) {
    193                 *pDest++ = ch;
    194             } else {
    195                 ++reqLength;
    196             }
    197         }
    198         srcLimit = src;
    199         if(ch != 0) {
    200             /* "complicated" case, find the end of the remaining string */
    201             while(*++srcLimit != 0) {}
    202         }
    203     } else {
    204         srcLimit = (src!=NULL)?(src + srcLength):NULL;
    205     }
    206 
    207     /* convert with length */
    208     while(src < srcLimit) {
    209         ch = *src++;
    210         if(!U16_IS_SURROGATE(ch)) {
    211             /* write or count ch below */
    212         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    213             ++src;
    214             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    215         } else if((ch = subchar) < 0) {
    216             /* unpaired surrogate */
    217             *pErrorCode = U_INVALID_CHAR_FOUND;
    218             return NULL;
    219         } else {
    220             ++numSubstitutions;
    221         }
    222         if(pDest < destLimit) {
    223             *pDest++ = ch;
    224         } else {
    225             ++reqLength;
    226         }
    227     }
    228 
    229     reqLength += (int32_t)(pDest - dest);
    230     if(pDestLength) {
    231         *pDestLength = reqLength;
    232     }
    233     if(pNumSubstitutions != NULL) {
    234         *pNumSubstitutions = numSubstitutions;
    235     }
    236 
    237     /* Terminate the buffer */
    238     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    239 
    240     return dest;
    241 }
    242 
    243 U_CAPI UChar32* U_EXPORT2
    244 u_strToUTF32(UChar32 *dest,
    245              int32_t destCapacity,
    246              int32_t *pDestLength,
    247              const UChar *src,
    248              int32_t srcLength,
    249              UErrorCode *pErrorCode) {
    250     return u_strToUTF32WithSub(
    251             dest, destCapacity, pDestLength,
    252             src, srcLength,
    253             U_SENTINEL, NULL,
    254             pErrorCode);
    255 }
    256 
    257 /* for utf8_nextCharSafeBodyTerminated() */
    258 static const UChar32
    259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
    260 
    261 /*
    262  * Version of utf8_nextCharSafeBody() with the following differences:
    263  * - checks for NUL termination instead of length
    264  * - works with pointers instead of indexes
    265  * - always strict (strict==-1)
    266  *
    267  * *ps points to after the lead byte and will be moved to after the last trail byte.
    268  * c is the lead byte.
    269  * @return the code point, or U_SENTINEL
    270  */
    271 static UChar32
    272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
    273     const uint8_t *s=*ps;
    274     uint8_t trail, illegal=0;
    275     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    276     U_ASSERT(count<6);
    277     U8_MASK_LEAD_BYTE((c), count);
    278     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    279     switch(count) {
    280     /* each branch falls through to the next one */
    281     case 5:
    282     case 4:
    283         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    284         illegal=1;
    285         break;
    286     case 3:
    287         trail=(uint8_t)(*s++ - 0x80);
    288         c=(c<<6)|trail;
    289         if(trail>0x3f || c>=0x110) {
    290             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
    291             illegal=1;
    292             break;
    293         }
    294     case 2: /*fall through*/
    295         trail=(uint8_t)(*s++ - 0x80);
    296         if(trail>0x3f) {
    297             /* not a trail byte */
    298             illegal=1;
    299             break;
    300         }
    301         c=(c<<6)|trail;
    302     case 1: /*fall through*/
    303         trail=(uint8_t)(*s++ - 0x80);
    304         if(trail>0x3f) {
    305             /* not a trail byte */
    306             illegal=1;
    307         }
    308         c=(c<<6)|trail;
    309         break;
    310     case 0:
    311         return U_SENTINEL;
    312     /* no default branch to optimize switch()  - all values are covered */
    313     }
    314 
    315     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    316     /* illegal is also set if count>=4 */
    317     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    318         /* error handling */
    319         /* don't go beyond this sequence */
    320         s=*ps;
    321         while(count>0 && U8_IS_TRAIL(*s)) {
    322             ++s;
    323             --count;
    324         }
    325         c=U_SENTINEL;
    326     }
    327     *ps=s;
    328     return c;
    329 }
    330 
    331 /*
    332  * Version of utf8_nextCharSafeBody() with the following differences:
    333  * - works with pointers instead of indexes
    334  * - always strict (strict==-1)
    335  *
    336  * *ps points to after the lead byte and will be moved to after the last trail byte.
    337  * c is the lead byte.
    338  * @return the code point, or U_SENTINEL
    339  */
    340 static UChar32
    341 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
    342     const uint8_t *s=*ps;
    343     uint8_t trail, illegal=0;
    344     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    345     if((limit-s)>=count) {
    346         U8_MASK_LEAD_BYTE((c), count);
    347         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    348         switch(count) {
    349         /* each branch falls through to the next one */
    350         case 5:
    351         case 4:
    352             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    353             illegal=1;
    354             break;
    355         case 3:
    356             trail=*s++;
    357             c=(c<<6)|(trail&0x3f);
    358             if(c<0x110) {
    359                 illegal|=(trail&0xc0)^0x80;
    360             } else {
    361                 /* code point>0x10ffff, outside Unicode */
    362                 illegal=1;
    363                 break;
    364             }
    365         case 2: /*fall through*/
    366             trail=*s++;
    367             c=(c<<6)|(trail&0x3f);
    368             illegal|=(trail&0xc0)^0x80;
    369         case 1: /*fall through*/
    370             trail=*s++;
    371             c=(c<<6)|(trail&0x3f);
    372             illegal|=(trail&0xc0)^0x80;
    373             break;
    374         case 0:
    375             return U_SENTINEL;
    376         /* no default branch to optimize switch()  - all values are covered */
    377         }
    378     } else {
    379         illegal=1; /* too few bytes left */
    380     }
    381 
    382     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    383     /* illegal is also set if count>=4 */
    384     U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0]));
    385     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    386         /* error handling */
    387         /* don't go beyond this sequence */
    388         s=*ps;
    389         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
    390             ++s;
    391             --count;
    392         }
    393         c=U_SENTINEL;
    394     }
    395     *ps=s;
    396     return c;
    397 }
    398 
    399 U_CAPI UChar* U_EXPORT2
    400 u_strFromUTF8WithSub(UChar *dest,
    401               int32_t destCapacity,
    402               int32_t *pDestLength,
    403               const char* src,
    404               int32_t srcLength,
    405               UChar32 subchar, int32_t *pNumSubstitutions,
    406               UErrorCode *pErrorCode){
    407     UChar *pDest = dest;
    408     UChar *pDestLimit = dest+destCapacity;
    409     UChar32 ch;
    410     int32_t reqLength = 0;
    411     const uint8_t* pSrc = (const uint8_t*) src;
    412     uint8_t t1, t2; /* trail bytes */
    413     int32_t numSubstitutions;
    414 
    415     /* args check */
    416     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    417         return NULL;
    418     }
    419 
    420     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    421         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    422         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    423     ) {
    424         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    425         return NULL;
    426     }
    427 
    428     if(pNumSubstitutions!=NULL) {
    429         *pNumSubstitutions=0;
    430     }
    431     numSubstitutions=0;
    432 
    433     /*
    434      * Inline processing of UTF-8 byte sequences:
    435      *
    436      * Byte sequences for the most common characters are handled inline in
    437      * the conversion loops. In order to reduce the path lengths for those
    438      * characters, the tests are arranged in a kind of binary search.
    439      * ASCII (<=0x7f) is checked first, followed by the dividing point
    440      * between 2- and 3-byte sequences (0xe0).
    441      * The 3-byte branch is tested first to speed up CJK text.
    442      * The compiler should combine the subtractions for the two tests for 0xe0.
    443      * Each branch then tests for the other end of its range.
    444      */
    445 
    446     if(srcLength < 0){
    447         /*
    448          * Transform a NUL-terminated string.
    449          * The code explicitly checks for NULs only in the lead byte position.
    450          * A NUL byte in the trail byte position fails the trail byte range check anyway.
    451          */
    452         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    453             if(ch <= 0x7f){
    454                 *pDest++=(UChar)ch;
    455                 ++pSrc;
    456             } else {
    457                 if(ch > 0xe0) {
    458                     if( /* handle U+1000..U+CFFF inline */
    459                         ch <= 0xec &&
    460                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    461                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    462                     ) {
    463                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    464                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    465                         pSrc += 3;
    466                         continue;
    467                     }
    468                 } else if(ch < 0xe0) {
    469                     if( /* handle U+0080..U+07FF inline */
    470                         ch >= 0xc2 &&
    471                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    472                     ) {
    473                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    474                         pSrc += 2;
    475                         continue;
    476                     }
    477                 }
    478 
    479                 /* function call for "complicated" and error cases */
    480                 ++pSrc; /* continue after the lead byte */
    481                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    482                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    483                     *pErrorCode = U_INVALID_CHAR_FOUND;
    484                     return NULL;
    485                 } else if(ch<=0xFFFF) {
    486                     *(pDest++)=(UChar)ch;
    487                 } else {
    488                     *(pDest++)=U16_LEAD(ch);
    489                     if(pDest<pDestLimit) {
    490                         *(pDest++)=U16_TRAIL(ch);
    491                     } else {
    492                         reqLength++;
    493                         break;
    494                     }
    495                 }
    496             }
    497         }
    498 
    499         /* Pre-flight the rest of the string. */
    500         while((ch = *pSrc) != 0) {
    501             if(ch <= 0x7f){
    502                 ++reqLength;
    503                 ++pSrc;
    504             } else {
    505                 if(ch > 0xe0) {
    506                     if( /* handle U+1000..U+CFFF inline */
    507                         ch <= 0xec &&
    508                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    509                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    510                     ) {
    511                         ++reqLength;
    512                         pSrc += 3;
    513                         continue;
    514                     }
    515                 } else if(ch < 0xe0) {
    516                     if( /* handle U+0080..U+07FF inline */
    517                         ch >= 0xc2 &&
    518                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    519                     ) {
    520                         ++reqLength;
    521                         pSrc += 2;
    522                         continue;
    523                     }
    524                 }
    525 
    526                 /* function call for "complicated" and error cases */
    527                 ++pSrc; /* continue after the lead byte */
    528                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    529                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    530                     *pErrorCode = U_INVALID_CHAR_FOUND;
    531                     return NULL;
    532                 }
    533                 reqLength += U16_LENGTH(ch);
    534             }
    535         }
    536     } else /* srcLength >= 0 */ {
    537         const uint8_t *pSrcLimit = pSrc + srcLength;
    538         int32_t count;
    539 
    540         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    541         for(;;) {
    542             /*
    543              * Each iteration of the inner loop progresses by at most 3 UTF-8
    544              * bytes and one UChar, for most characters.
    545              * For supplementary code points (4 & 2), which are rare,
    546              * there is an additional adjustment.
    547              */
    548             count = (int32_t)(pDestLimit - pDest);
    549             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
    550             if(count > srcLength) {
    551                 count = srcLength; /* min(remaining dest, remaining src/3) */
    552             }
    553             if(count < 3) {
    554                 /*
    555                  * Too much overhead if we get near the end of the string,
    556                  * continue with the next loop.
    557                  */
    558                 break;
    559             }
    560 
    561             do {
    562                 ch = *pSrc;
    563                 if(ch <= 0x7f){
    564                     *pDest++=(UChar)ch;
    565                     ++pSrc;
    566                 } else {
    567                     if(ch > 0xe0) {
    568                         if( /* handle U+1000..U+CFFF inline */
    569                             ch <= 0xec &&
    570                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    571                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    572                         ) {
    573                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    574                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    575                             pSrc += 3;
    576                             continue;
    577                         }
    578                     } else if(ch < 0xe0) {
    579                         if( /* handle U+0080..U+07FF inline */
    580                             ch >= 0xc2 &&
    581                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    582                         ) {
    583                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    584                             pSrc += 2;
    585                             continue;
    586                         }
    587                     }
    588 
    589                     if(ch >= 0xf0 || subchar > 0xffff) {
    590                         /*
    591                          * We may read up to six bytes and write up to two UChars,
    592                          * which we didn't account for with computing count,
    593                          * so we adjust it here.
    594                          */
    595                         if(--count == 0) {
    596                             break;
    597                         }
    598                     }
    599 
    600                     /* function call for "complicated" and error cases */
    601                     ++pSrc; /* continue after the lead byte */
    602                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    603                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    604                         *pErrorCode = U_INVALID_CHAR_FOUND;
    605                         return NULL;
    606                     }else if(ch<=0xFFFF){
    607                         *(pDest++)=(UChar)ch;
    608                     }else{
    609                         *(pDest++)=U16_LEAD(ch);
    610                         *(pDest++)=U16_TRAIL(ch);
    611                     }
    612                 }
    613             } while(--count > 0);
    614         }
    615 
    616         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
    617             ch = *pSrc;
    618             if(ch <= 0x7f){
    619                 *pDest++=(UChar)ch;
    620                 ++pSrc;
    621             } else {
    622                 if(ch > 0xe0) {
    623                     if( /* handle U+1000..U+CFFF inline */
    624                         ch <= 0xec &&
    625                         ((pSrcLimit - pSrc) >= 3) &&
    626                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    627                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    628                     ) {
    629                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    630                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    631                         pSrc += 3;
    632                         continue;
    633                     }
    634                 } else if(ch < 0xe0) {
    635                     if( /* handle U+0080..U+07FF inline */
    636                         ch >= 0xc2 &&
    637                         ((pSrcLimit - pSrc) >= 2) &&
    638                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    639                     ) {
    640                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    641                         pSrc += 2;
    642                         continue;
    643                     }
    644                 }
    645 
    646                 /* function call for "complicated" and error cases */
    647                 ++pSrc; /* continue after the lead byte */
    648                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    649                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    650                     *pErrorCode = U_INVALID_CHAR_FOUND;
    651                     return NULL;
    652                 }else if(ch<=0xFFFF){
    653                     *(pDest++)=(UChar)ch;
    654                 }else{
    655                     *(pDest++)=U16_LEAD(ch);
    656                     if(pDest<pDestLimit){
    657                         *(pDest++)=U16_TRAIL(ch);
    658                     }else{
    659                         reqLength++;
    660                         break;
    661                     }
    662                 }
    663             }
    664         }
    665         /* do not fill the dest buffer just count the UChars needed */
    666         while(pSrc < pSrcLimit){
    667             ch = *pSrc;
    668             if(ch <= 0x7f){
    669                 reqLength++;
    670                 ++pSrc;
    671             } else {
    672                 if(ch > 0xe0) {
    673                     if( /* handle U+1000..U+CFFF inline */
    674                         ch <= 0xec &&
    675                         ((pSrcLimit - pSrc) >= 3) &&
    676                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    677                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    678                     ) {
    679                         reqLength++;
    680                         pSrc += 3;
    681                         continue;
    682                     }
    683                 } else if(ch < 0xe0) {
    684                     if( /* handle U+0080..U+07FF inline */
    685                         ch >= 0xc2 &&
    686                         ((pSrcLimit - pSrc) >= 2) &&
    687                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    688                     ) {
    689                         reqLength++;
    690                         pSrc += 2;
    691                         continue;
    692                     }
    693                 }
    694 
    695                 /* function call for "complicated" and error cases */
    696                 ++pSrc; /* continue after the lead byte */
    697                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    698                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    699                     *pErrorCode = U_INVALID_CHAR_FOUND;
    700                     return NULL;
    701                 }
    702                 reqLength+=U16_LENGTH(ch);
    703             }
    704         }
    705     }
    706 
    707     reqLength+=(int32_t)(pDest - dest);
    708 
    709     if(pNumSubstitutions!=NULL) {
    710         *pNumSubstitutions=numSubstitutions;
    711     }
    712 
    713     if(pDestLength){
    714         *pDestLength = reqLength;
    715     }
    716 
    717     /* Terminate the buffer */
    718     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    719 
    720     return dest;
    721 }
    722 
    723 U_CAPI UChar* U_EXPORT2
    724 u_strFromUTF8(UChar *dest,
    725               int32_t destCapacity,
    726               int32_t *pDestLength,
    727               const char* src,
    728               int32_t srcLength,
    729               UErrorCode *pErrorCode){
    730     return u_strFromUTF8WithSub(
    731             dest, destCapacity, pDestLength,
    732             src, srcLength,
    733             U_SENTINEL, NULL,
    734             pErrorCode);
    735 }
    736 
    737 U_CAPI UChar * U_EXPORT2
    738 u_strFromUTF8Lenient(UChar *dest,
    739                      int32_t destCapacity,
    740                      int32_t *pDestLength,
    741                      const char *src,
    742                      int32_t srcLength,
    743                      UErrorCode *pErrorCode) {
    744     UChar *pDest = dest;
    745     UChar32 ch;
    746     int32_t reqLength = 0;
    747     uint8_t* pSrc = (uint8_t*) src;
    748 
    749     /* args check */
    750     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    751         return NULL;
    752     }
    753 
    754     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    755         (destCapacity<0) || (dest == NULL && destCapacity > 0)
    756     ) {
    757         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    758         return NULL;
    759     }
    760 
    761     if(srcLength < 0) {
    762         /* Transform a NUL-terminated string. */
    763         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
    764         uint8_t t1, t2, t3; /* trail bytes */
    765 
    766         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    767             if(ch < 0xc0) {
    768                 /*
    769                  * ASCII, or a trail byte in lead position which is treated like
    770                  * a single-byte sequence for better character boundary
    771                  * resynchronization after illegal sequences.
    772                  */
    773                 *pDest++=(UChar)ch;
    774                 ++pSrc;
    775                 continue;
    776             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    777                 if((t1 = pSrc[1]) != 0) {
    778                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    779                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
    780                     pSrc += 2;
    781                     continue;
    782                 }
    783             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    784                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    785                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    786                     /* 0x2080 = (0x80 << 6) + 0x80 */
    787                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    788                     pSrc += 3;
    789                     continue;
    790                 }
    791             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    792                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    793                     pSrc += 4;
    794                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    795                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    796                     *(pDest++) = U16_LEAD(ch);
    797                     if(pDest < pDestLimit) {
    798                         *(pDest++) = U16_TRAIL(ch);
    799                     } else {
    800                         reqLength = 1;
    801                         break;
    802                     }
    803                     continue;
    804                 }
    805             }
    806 
    807             /* truncated character at the end */
    808             *pDest++ = 0xfffd;
    809             while(*++pSrc != 0) {}
    810             break;
    811         }
    812 
    813         /* Pre-flight the rest of the string. */
    814         while((ch = *pSrc) != 0) {
    815             if(ch < 0xc0) {
    816                 /*
    817                  * ASCII, or a trail byte in lead position which is treated like
    818                  * a single-byte sequence for better character boundary
    819                  * resynchronization after illegal sequences.
    820                  */
    821                 ++reqLength;
    822                 ++pSrc;
    823                 continue;
    824             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    825                 if(pSrc[1] != 0) {
    826                     ++reqLength;
    827                     pSrc += 2;
    828                     continue;
    829                 }
    830             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    831                 if(pSrc[1] != 0 && pSrc[2] != 0) {
    832                     ++reqLength;
    833                     pSrc += 3;
    834                     continue;
    835                 }
    836             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    837                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    838                     reqLength += 2;
    839                     pSrc += 4;
    840                     continue;
    841                 }
    842             }
    843 
    844             /* truncated character at the end */
    845             ++reqLength;
    846             break;
    847         }
    848     } else /* srcLength >= 0 */ {
    849       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
    850 
    851         /*
    852          * This function requires that if srcLength is given, then it must be
    853          * destCapatity >= srcLength so that we need not check for
    854          * destination buffer overflow in the loop.
    855          */
    856         if(destCapacity < srcLength) {
    857             if(pDestLength != NULL) {
    858                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    859             }
    860             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    861             return NULL;
    862         }
    863 
    864         if((pSrcLimit - pSrc) >= 4) {
    865             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    866 
    867             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    868             do {
    869                 ch = *pSrc++;
    870                 if(ch < 0xc0) {
    871                     /*
    872                      * ASCII, or a trail byte in lead position which is treated like
    873                      * a single-byte sequence for better character boundary
    874                      * resynchronization after illegal sequences.
    875                      */
    876                     *pDest++=(UChar)ch;
    877                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
    878                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    879                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    880                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    881                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    882                     /* 0x2080 = (0x80 << 6) + 0x80 */
    883                     ch = (ch << 12) + (*pSrc++ << 6);
    884                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    885                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    886                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    887                     ch = (ch << 18) + (*pSrc++ << 12);
    888                     ch += *pSrc++ << 6;
    889                     ch += *pSrc++ - 0x3c82080;
    890                     *(pDest++) = U16_LEAD(ch);
    891                     *(pDest++) = U16_TRAIL(ch);
    892                 }
    893             } while(pSrc < pSrcLimit);
    894 
    895             pSrcLimit += 3; /* restore original pSrcLimit */
    896         }
    897 
    898         while(pSrc < pSrcLimit) {
    899             ch = *pSrc++;
    900             if(ch < 0xc0) {
    901                 /*
    902                  * ASCII, or a trail byte in lead position which is treated like
    903                  * a single-byte sequence for better character boundary
    904                  * resynchronization after illegal sequences.
    905                  */
    906                 *pDest++=(UChar)ch;
    907                 continue;
    908             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    909                 if(pSrc < pSrcLimit) {
    910                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    911                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    912                     continue;
    913                 }
    914             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    915                 if((pSrcLimit - pSrc) >= 2) {
    916                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    917                     /* 0x2080 = (0x80 << 6) + 0x80 */
    918                     ch = (ch << 12) + (*pSrc++ << 6);
    919                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    920                     pSrc += 3;
    921                     continue;
    922                 }
    923             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    924                 if((pSrcLimit - pSrc) >= 3) {
    925                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    926                     ch = (ch << 18) + (*pSrc++ << 12);
    927                     ch += *pSrc++ << 6;
    928                     ch += *pSrc++ - 0x3c82080;
    929                     *(pDest++) = U16_LEAD(ch);
    930                     *(pDest++) = U16_TRAIL(ch);
    931                     pSrc += 4;
    932                     continue;
    933                 }
    934             }
    935 
    936             /* truncated character at the end */
    937             *pDest++ = 0xfffd;
    938             break;
    939         }
    940     }
    941 
    942     reqLength+=(int32_t)(pDest - dest);
    943 
    944     if(pDestLength){
    945         *pDestLength = reqLength;
    946     }
    947 
    948     /* Terminate the buffer */
    949     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    950 
    951     return dest;
    952 }
    953 
    954 static inline uint8_t *
    955 _appendUTF8(uint8_t *pDest, UChar32 c) {
    956     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    957     if((c)<=0x7f) {
    958         *pDest++=(uint8_t)c;
    959     } else if(c<=0x7ff) {
    960         *pDest++=(uint8_t)((c>>6)|0xc0);
    961         *pDest++=(uint8_t)((c&0x3f)|0x80);
    962     } else if(c<=0xffff) {
    963         *pDest++=(uint8_t)((c>>12)|0xe0);
    964         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
    965         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    966     } else /* if((uint32_t)(c)<=0x10ffff) */ {
    967         *pDest++=(uint8_t)(((c)>>18)|0xf0);
    968         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
    969         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
    970         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    971     }
    972     return pDest;
    973 }
    974 
    975 
    976 U_CAPI char* U_EXPORT2
    977 u_strToUTF8WithSub(char *dest,
    978             int32_t destCapacity,
    979             int32_t *pDestLength,
    980             const UChar *pSrc,
    981             int32_t srcLength,
    982             UChar32 subchar, int32_t *pNumSubstitutions,
    983             UErrorCode *pErrorCode){
    984     int32_t reqLength=0;
    985     uint32_t ch=0,ch2=0;
    986     uint8_t *pDest = (uint8_t *)dest;
    987     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    988     int32_t numSubstitutions;
    989 
    990     /* args check */
    991     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    992         return NULL;
    993     }
    994 
    995     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
    996         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    997         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    998     ) {
    999         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1000         return NULL;
   1001     }
   1002 
   1003     if(pNumSubstitutions!=NULL) {
   1004         *pNumSubstitutions=0;
   1005     }
   1006     numSubstitutions=0;
   1007 
   1008     if(srcLength==-1) {
   1009         while((ch=*pSrc)!=0) {
   1010             ++pSrc;
   1011             if(ch <= 0x7f) {
   1012                 if(pDest<pDestLimit) {
   1013                     *pDest++ = (uint8_t)ch;
   1014                 } else {
   1015                     reqLength = 1;
   1016                     break;
   1017                 }
   1018             } else if(ch <= 0x7ff) {
   1019                 if((pDestLimit - pDest) >= 2) {
   1020                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1021                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1022                 } else {
   1023                     reqLength = 2;
   1024                     break;
   1025                 }
   1026             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1027                 if((pDestLimit - pDest) >= 3) {
   1028                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1029                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1030                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1031                 } else {
   1032                     reqLength = 3;
   1033                     break;
   1034                 }
   1035             } else /* ch is a surrogate */ {
   1036                 int32_t length;
   1037 
   1038                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
   1039                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1040                     ++pSrc;
   1041                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1042                 } else if(subchar>=0) {
   1043                     ch=subchar;
   1044                     ++numSubstitutions;
   1045                 } else {
   1046                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1047                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1048                     return NULL;
   1049                 }
   1050 
   1051                 length = U8_LENGTH(ch);
   1052                 if((pDestLimit - pDest) >= length) {
   1053                     /* convert and append*/
   1054                     pDest=_appendUTF8(pDest, ch);
   1055                 } else {
   1056                     reqLength = length;
   1057                     break;
   1058                 }
   1059             }
   1060         }
   1061         while((ch=*pSrc++)!=0) {
   1062             if(ch<=0x7f) {
   1063                 ++reqLength;
   1064             } else if(ch<=0x7ff) {
   1065                 reqLength+=2;
   1066             } else if(!U16_IS_SURROGATE(ch)) {
   1067                 reqLength+=3;
   1068             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1069                 ++pSrc;
   1070                 reqLength+=4;
   1071             } else if(subchar>=0) {
   1072                 reqLength+=U8_LENGTH(subchar);
   1073                 ++numSubstitutions;
   1074             } else {
   1075                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1076                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1077                 return NULL;
   1078             }
   1079         }
   1080     } else {
   1081         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
   1082         int32_t count;
   1083 
   1084         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1085         for(;;) {
   1086             /*
   1087              * Each iteration of the inner loop progresses by at most 3 UTF-8
   1088              * bytes and one UChar, for most characters.
   1089              * For supplementary code points (4 & 2), which are rare,
   1090              * there is an additional adjustment.
   1091              */
   1092             count = (int32_t)((pDestLimit - pDest) / 3);
   1093             srcLength = (int32_t)(pSrcLimit - pSrc);
   1094             if(count > srcLength) {
   1095                 count = srcLength; /* min(remaining dest/3, remaining src) */
   1096             }
   1097             if(count < 3) {
   1098                 /*
   1099                  * Too much overhead if we get near the end of the string,
   1100                  * continue with the next loop.
   1101                  */
   1102                 break;
   1103             }
   1104             do {
   1105                 ch=*pSrc++;
   1106                 if(ch <= 0x7f) {
   1107                     *pDest++ = (uint8_t)ch;
   1108                 } else if(ch <= 0x7ff) {
   1109                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1110                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1111                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1112                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1113                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1114                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1115                 } else /* ch is a surrogate */ {
   1116                     /*
   1117                      * We will read two UChars and probably output four bytes,
   1118                      * which we didn't account for with computing count,
   1119                      * so we adjust it here.
   1120                      */
   1121                     if(--count == 0) {
   1122                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
   1123                         break;  /* recompute count */
   1124                     }
   1125 
   1126                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1127                         ++pSrc;
   1128                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1129 
   1130                         /* writing 4 bytes per 2 UChars is ok */
   1131                         *pDest++=(uint8_t)((ch>>18)|0xf0);
   1132                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
   1133                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1134                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1135                     } else  {
   1136                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1137                         if(subchar>=0) {
   1138                             ch=subchar;
   1139                             ++numSubstitutions;
   1140                         } else {
   1141                             *pErrorCode = U_INVALID_CHAR_FOUND;
   1142                             return NULL;
   1143                         }
   1144 
   1145                         /* convert and append*/
   1146                         pDest=_appendUTF8(pDest, ch);
   1147                     }
   1148                 }
   1149             } while(--count > 0);
   1150         }
   1151 
   1152         while(pSrc<pSrcLimit) {
   1153             ch=*pSrc++;
   1154             if(ch <= 0x7f) {
   1155                 if(pDest<pDestLimit) {
   1156                     *pDest++ = (uint8_t)ch;
   1157                 } else {
   1158                     reqLength = 1;
   1159                     break;
   1160                 }
   1161             } else if(ch <= 0x7ff) {
   1162                 if((pDestLimit - pDest) >= 2) {
   1163                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1164                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1165                 } else {
   1166                     reqLength = 2;
   1167                     break;
   1168                 }
   1169             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1170                 if((pDestLimit - pDest) >= 3) {
   1171                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1172                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1173                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1174                 } else {
   1175                     reqLength = 3;
   1176                     break;
   1177                 }
   1178             } else /* ch is a surrogate */ {
   1179                 int32_t length;
   1180 
   1181                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1182                     ++pSrc;
   1183                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1184                 } else if(subchar>=0) {
   1185                     ch=subchar;
   1186                     ++numSubstitutions;
   1187                 } else {
   1188                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1189                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1190                     return NULL;
   1191                 }
   1192 
   1193                 length = U8_LENGTH(ch);
   1194                 if((pDestLimit - pDest) >= length) {
   1195                     /* convert and append*/
   1196                     pDest=_appendUTF8(pDest, ch);
   1197                 } else {
   1198                     reqLength = length;
   1199                     break;
   1200                 }
   1201             }
   1202         }
   1203         while(pSrc<pSrcLimit) {
   1204             ch=*pSrc++;
   1205             if(ch<=0x7f) {
   1206                 ++reqLength;
   1207             } else if(ch<=0x7ff) {
   1208                 reqLength+=2;
   1209             } else if(!U16_IS_SURROGATE(ch)) {
   1210                 reqLength+=3;
   1211             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1212                 ++pSrc;
   1213                 reqLength+=4;
   1214             } else if(subchar>=0) {
   1215                 reqLength+=U8_LENGTH(subchar);
   1216                 ++numSubstitutions;
   1217             } else {
   1218                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1219                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1220                 return NULL;
   1221             }
   1222         }
   1223     }
   1224 
   1225     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1226 
   1227     if(pNumSubstitutions!=NULL) {
   1228         *pNumSubstitutions=numSubstitutions;
   1229     }
   1230 
   1231     if(pDestLength){
   1232         *pDestLength = reqLength;
   1233     }
   1234 
   1235     /* Terminate the buffer */
   1236     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1237     return dest;
   1238 }
   1239 
   1240 U_CAPI char* U_EXPORT2
   1241 u_strToUTF8(char *dest,
   1242             int32_t destCapacity,
   1243             int32_t *pDestLength,
   1244             const UChar *pSrc,
   1245             int32_t srcLength,
   1246             UErrorCode *pErrorCode){
   1247     return u_strToUTF8WithSub(
   1248             dest, destCapacity, pDestLength,
   1249             pSrc, srcLength,
   1250             U_SENTINEL, NULL,
   1251             pErrorCode);
   1252 }
   1253 
   1254 U_CAPI UChar* U_EXPORT2
   1255 u_strFromJavaModifiedUTF8WithSub(
   1256         UChar *dest,
   1257         int32_t destCapacity,
   1258         int32_t *pDestLength,
   1259         const char *src,
   1260         int32_t srcLength,
   1261         UChar32 subchar, int32_t *pNumSubstitutions,
   1262         UErrorCode *pErrorCode) {
   1263     UChar *pDest = dest;
   1264     UChar *pDestLimit = dest+destCapacity;
   1265     UChar32 ch;
   1266     int32_t reqLength = 0;
   1267     const uint8_t* pSrc = (const uint8_t*) src;
   1268     const uint8_t *pSrcLimit;
   1269     int32_t count;
   1270     uint8_t t1, t2; /* trail bytes */
   1271     int32_t numSubstitutions;
   1272 
   1273     /* args check */
   1274     if(U_FAILURE(*pErrorCode)){
   1275         return NULL;
   1276     }
   1277     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1278         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
   1279         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1280     ) {
   1281         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1282         return NULL;
   1283     }
   1284 
   1285     if(pNumSubstitutions!=NULL) {
   1286         *pNumSubstitutions=0;
   1287     }
   1288     numSubstitutions=0;
   1289 
   1290     if(srcLength < 0) {
   1291         /*
   1292          * Transform a NUL-terminated ASCII string.
   1293          * Handle non-ASCII strings with slower code.
   1294          */
   1295         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
   1296             *pDest++=(UChar)ch;
   1297             ++pSrc;
   1298         }
   1299         if(ch == 0) {
   1300             reqLength=(int32_t)(pDest - dest);
   1301             if(pDestLength) {
   1302                 *pDestLength = reqLength;
   1303             }
   1304 
   1305             /* Terminate the buffer */
   1306             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1307             return dest;
   1308         }
   1309         srcLength = uprv_strlen((const char *)pSrc);
   1310     }
   1311 
   1312     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1313     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
   1314     for(;;) {
   1315         count = (int32_t)(pDestLimit - pDest);
   1316         srcLength = (int32_t)(pSrcLimit - pSrc);
   1317         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
   1318             /* fast ASCII loop */
   1319             const uint8_t *prevSrc = pSrc;
   1320             int32_t delta;
   1321             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
   1322                 *pDest++=(UChar)ch;
   1323                 ++pSrc;
   1324             }
   1325             delta = (int32_t)(pSrc - prevSrc);
   1326             count -= delta;
   1327             srcLength -= delta;
   1328         }
   1329         /*
   1330          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1331          * bytes and one UChar.
   1332          */
   1333         srcLength /= 3;
   1334         if(count > srcLength) {
   1335             count = srcLength; /* min(remaining dest, remaining src/3) */
   1336         }
   1337         if(count < 3) {
   1338             /*
   1339              * Too much overhead if we get near the end of the string,
   1340              * continue with the next loop.
   1341              */
   1342             break;
   1343         }
   1344         do {
   1345             ch = *pSrc;
   1346             if(ch <= 0x7f){
   1347                 *pDest++=(UChar)ch;
   1348                 ++pSrc;
   1349             } else {
   1350                 if(ch >= 0xe0) {
   1351                     if( /* handle U+0000..U+FFFF inline */
   1352                         ch <= 0xef &&
   1353                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1354                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1355                     ) {
   1356                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1357                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1358                         pSrc += 3;
   1359                         continue;
   1360                     }
   1361                 } else {
   1362                     if( /* handle U+0000..U+07FF inline */
   1363                         ch >= 0xc0 &&
   1364                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1365                     ) {
   1366                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1367                         pSrc += 2;
   1368                         continue;
   1369                     }
   1370                 }
   1371 
   1372                 if(subchar < 0) {
   1373                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1374                     return NULL;
   1375                 } else if(subchar > 0xffff && --count == 0) {
   1376                     /*
   1377                      * We need to write two UChars, adjusted count for that,
   1378                      * and ran out of space.
   1379                      */
   1380                     break;
   1381                 } else {
   1382                     /* function call for error cases */
   1383                     ++pSrc; /* continue after the lead byte */
   1384                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1385                     ++numSubstitutions;
   1386                     if(subchar<=0xFFFF) {
   1387                         *(pDest++)=(UChar)subchar;
   1388                     } else {
   1389                         *(pDest++)=U16_LEAD(subchar);
   1390                         *(pDest++)=U16_TRAIL(subchar);
   1391                     }
   1392                 }
   1393             }
   1394         } while(--count > 0);
   1395     }
   1396 
   1397     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
   1398         ch = *pSrc;
   1399         if(ch <= 0x7f){
   1400             *pDest++=(UChar)ch;
   1401             ++pSrc;
   1402         } else {
   1403             if(ch >= 0xe0) {
   1404                 if( /* handle U+0000..U+FFFF inline */
   1405                     ch <= 0xef &&
   1406                     ((pSrcLimit - pSrc) >= 3) &&
   1407                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1408                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1409                 ) {
   1410                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1411                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1412                     pSrc += 3;
   1413                     continue;
   1414                 }
   1415             } else {
   1416                 if( /* handle U+0000..U+07FF inline */
   1417                     ch >= 0xc0 &&
   1418                     ((pSrcLimit - pSrc) >= 2) &&
   1419                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1420                 ) {
   1421                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1422                     pSrc += 2;
   1423                     continue;
   1424                 }
   1425             }
   1426 
   1427             if(subchar < 0) {
   1428                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1429                 return NULL;
   1430             } else {
   1431                 /* function call for error cases */
   1432                 ++pSrc; /* continue after the lead byte */
   1433                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1434                 ++numSubstitutions;
   1435                 if(subchar<=0xFFFF) {
   1436                     *(pDest++)=(UChar)subchar;
   1437                 } else {
   1438                     *(pDest++)=U16_LEAD(subchar);
   1439                     if(pDest<pDestLimit) {
   1440                         *(pDest++)=U16_TRAIL(subchar);
   1441                     } else {
   1442                         reqLength++;
   1443                         break;
   1444                     }
   1445                 }
   1446             }
   1447         }
   1448     }
   1449 
   1450     /* do not fill the dest buffer just count the UChars needed */
   1451     while(pSrc < pSrcLimit){
   1452         ch = *pSrc;
   1453         if(ch <= 0x7f) {
   1454             reqLength++;
   1455             ++pSrc;
   1456         } else {
   1457             if(ch >= 0xe0) {
   1458                 if( /* handle U+0000..U+FFFF inline */
   1459                     ch <= 0xef &&
   1460                     ((pSrcLimit - pSrc) >= 3) &&
   1461                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   1462                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   1463                 ) {
   1464                     reqLength++;
   1465                     pSrc += 3;
   1466                     continue;
   1467                 }
   1468             } else {
   1469                 if( /* handle U+0000..U+07FF inline */
   1470                     ch >= 0xc0 &&
   1471                     ((pSrcLimit - pSrc) >= 2) &&
   1472                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   1473                 ) {
   1474                     reqLength++;
   1475                     pSrc += 2;
   1476                     continue;
   1477                 }
   1478             }
   1479 
   1480             if(subchar < 0) {
   1481                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1482                 return NULL;
   1483             } else {
   1484                 /* function call for error cases */
   1485                 ++pSrc; /* continue after the lead byte */
   1486                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1487                 ++numSubstitutions;
   1488                 reqLength+=U16_LENGTH(ch);
   1489             }
   1490         }
   1491     }
   1492 
   1493     if(pNumSubstitutions!=NULL) {
   1494         *pNumSubstitutions=numSubstitutions;
   1495     }
   1496 
   1497     reqLength+=(int32_t)(pDest - dest);
   1498     if(pDestLength) {
   1499         *pDestLength = reqLength;
   1500     }
   1501 
   1502     /* Terminate the buffer */
   1503     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1504     return dest;
   1505 }
   1506 
   1507 U_CAPI char* U_EXPORT2
   1508 u_strToJavaModifiedUTF8(
   1509         char *dest,
   1510         int32_t destCapacity,
   1511         int32_t *pDestLength,
   1512         const UChar *src,
   1513         int32_t srcLength,
   1514         UErrorCode *pErrorCode) {
   1515     int32_t reqLength=0;
   1516     uint32_t ch=0;
   1517     uint8_t *pDest = (uint8_t *)dest;
   1518     uint8_t *pDestLimit = pDest + destCapacity;
   1519     const UChar *pSrcLimit;
   1520     int32_t count;
   1521 
   1522     /* args check */
   1523     if(U_FAILURE(*pErrorCode)){
   1524         return NULL;
   1525     }
   1526     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1527         (dest==NULL && destCapacity!=0) || destCapacity<0
   1528     ) {
   1529         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1530         return NULL;
   1531     }
   1532 
   1533     if(srcLength==-1) {
   1534         /* Convert NUL-terminated ASCII, then find the string length. */
   1535         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1536             *pDest++ = (uint8_t)ch;
   1537             ++src;
   1538         }
   1539         if(ch == 0) {
   1540             reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1541             if(pDestLength) {
   1542                 *pDestLength = reqLength;
   1543             }
   1544 
   1545             /* Terminate the buffer */
   1546             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1547             return dest;
   1548         }
   1549         srcLength = u_strlen(src);
   1550     }
   1551 
   1552     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1553     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
   1554     for(;;) {
   1555         count = (int32_t)(pDestLimit - pDest);
   1556         srcLength = (int32_t)(pSrcLimit - src);
   1557         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1558             /* fast ASCII loop */
   1559             const UChar *prevSrc = src;
   1560             int32_t delta;
   1561             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1562                 *pDest++=(uint8_t)ch;
   1563                 ++src;
   1564             }
   1565             delta = (int32_t)(src - prevSrc);
   1566             count -= delta;
   1567             srcLength -= delta;
   1568         }
   1569         /*
   1570          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1571          * bytes and one UChar.
   1572          */
   1573         count /= 3;
   1574         if(count > srcLength) {
   1575             count = srcLength; /* min(remaining dest/3, remaining src) */
   1576         }
   1577         if(count < 3) {
   1578             /*
   1579              * Too much overhead if we get near the end of the string,
   1580              * continue with the next loop.
   1581              */
   1582             break;
   1583         }
   1584         do {
   1585             ch=*src++;
   1586             if(ch <= 0x7f && ch != 0) {
   1587                 *pDest++ = (uint8_t)ch;
   1588             } else if(ch <= 0x7ff) {
   1589                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1590                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1591             } else {
   1592                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1593                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1594                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1595             }
   1596         } while(--count > 0);
   1597     }
   1598 
   1599     while(src<pSrcLimit) {
   1600         ch=*src++;
   1601         if(ch <= 0x7f && ch != 0) {
   1602             if(pDest<pDestLimit) {
   1603                 *pDest++ = (uint8_t)ch;
   1604             } else {
   1605                 reqLength = 1;
   1606                 break;
   1607             }
   1608         } else if(ch <= 0x7ff) {
   1609             if((pDestLimit - pDest) >= 2) {
   1610                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1611                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1612             } else {
   1613                 reqLength = 2;
   1614                 break;
   1615             }
   1616         } else {
   1617             if((pDestLimit - pDest) >= 3) {
   1618                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1619                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1620                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1621             } else {
   1622                 reqLength = 3;
   1623                 break;
   1624             }
   1625         }
   1626     }
   1627     while(src<pSrcLimit) {
   1628         ch=*src++;
   1629         if(ch <= 0x7f && ch != 0) {
   1630             ++reqLength;
   1631         } else if(ch<=0x7ff) {
   1632             reqLength+=2;
   1633         } else {
   1634             reqLength+=3;
   1635         }
   1636     }
   1637 
   1638     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1639     if(pDestLength){
   1640         *pDestLength = reqLength;
   1641     }
   1642 
   1643     /* Terminate the buffer */
   1644     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1645     return dest;
   1646 }
   1647