Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2013, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *
      9 * File ustrtrns.cpp
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   9/10/2001    Ram    Creation.
     15 ******************************************************************************
     16 */
     17 
     18 /*******************************************************************************
     19  *
     20  * u_strTo* and u_strFrom* APIs
     21  * WCS functions moved to ustr_wcs.c for better modularization
     22  *
     23  *******************************************************************************
     24  */
     25 
     26 
     27 #include "unicode/putil.h"
     28 #include "unicode/ustring.h"
     29 #include "unicode/utf.h"
     30 #include "unicode/utf8.h"
     31 #include "unicode/utf16.h"
     32 #include "cstring.h"
     33 #include "cmemory.h"
     34 #include "ustr_imp.h"
     35 #include "uassert.h"
     36 
     37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     38 
     39 U_CAPI UChar* U_EXPORT2
     40 u_strFromUTF32WithSub(UChar *dest,
     41                int32_t destCapacity,
     42                int32_t *pDestLength,
     43                const UChar32 *src,
     44                int32_t srcLength,
     45                UChar32 subchar, int32_t *pNumSubstitutions,
     46                UErrorCode *pErrorCode) {
     47     const UChar32 *srcLimit;
     48     UChar32 ch;
     49     UChar *destLimit;
     50     UChar *pDest;
     51     int32_t reqLength;
     52     int32_t numSubstitutions;
     53 
     54     /* args check */
     55     if(U_FAILURE(*pErrorCode)){
     56         return NULL;
     57     }
     58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
     59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
     60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     61     ) {
     62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     63         return NULL;
     64     }
     65 
     66     if(pNumSubstitutions != NULL) {
     67         *pNumSubstitutions = 0;
     68     }
     69 
     70     pDest = dest;
     71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
     72     reqLength = 0;
     73     numSubstitutions = 0;
     74 
     75     if(srcLength < 0) {
     76         /* simple loop for conversion of a NUL-terminated BMP string */
     77         while((ch=*src) != 0 &&
     78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     79             ++src;
     80             if(pDest < destLimit) {
     81                 *pDest++ = (UChar)ch;
     82             } else {
     83                 ++reqLength;
     84             }
     85         }
     86         srcLimit = src;
     87         if(ch != 0) {
     88             /* "complicated" case, find the end of the remaining string */
     89             while(*++srcLimit != 0) {}
     90         }
     91     } else {
     92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
     93     }
     94 
     95     /* convert with length */
     96     while(src < srcLimit) {
     97         ch = *src++;
     98         do {
     99             /* usually "loops" once; twice only for writing subchar */
    100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
    101                 if(pDest < destLimit) {
    102                     *pDest++ = (UChar)ch;
    103                 } else {
    104                     ++reqLength;
    105                 }
    106                 break;
    107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
    108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
    109                     *pDest++ = U16_LEAD(ch);
    110                     *pDest++ = U16_TRAIL(ch);
    111                 } else {
    112                     reqLength += 2;
    113                 }
    114                 break;
    115             } else if((ch = subchar) < 0) {
    116                 /* surrogate code point, or not a Unicode code point at all */
    117                 *pErrorCode = U_INVALID_CHAR_FOUND;
    118                 return NULL;
    119             } else {
    120                 ++numSubstitutions;
    121             }
    122         } while(TRUE);
    123     }
    124 
    125     reqLength += (int32_t)(pDest - dest);
    126     if(pDestLength) {
    127         *pDestLength = reqLength;
    128     }
    129     if(pNumSubstitutions != NULL) {
    130         *pNumSubstitutions = numSubstitutions;
    131     }
    132 
    133     /* Terminate the buffer */
    134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    135 
    136     return dest;
    137 }
    138 
    139 U_CAPI UChar* U_EXPORT2
    140 u_strFromUTF32(UChar *dest,
    141                int32_t destCapacity,
    142                int32_t *pDestLength,
    143                const UChar32 *src,
    144                int32_t srcLength,
    145                UErrorCode *pErrorCode) {
    146     return u_strFromUTF32WithSub(
    147             dest, destCapacity, pDestLength,
    148             src, srcLength,
    149             U_SENTINEL, NULL,
    150             pErrorCode);
    151 }
    152 
    153 U_CAPI UChar32* U_EXPORT2
    154 u_strToUTF32WithSub(UChar32 *dest,
    155              int32_t destCapacity,
    156              int32_t *pDestLength,
    157              const UChar *src,
    158              int32_t srcLength,
    159              UChar32 subchar, int32_t *pNumSubstitutions,
    160              UErrorCode *pErrorCode) {
    161     const UChar *srcLimit;
    162     UChar32 ch;
    163     UChar ch2;
    164     UChar32 *destLimit;
    165     UChar32 *pDest;
    166     int32_t reqLength;
    167     int32_t numSubstitutions;
    168 
    169     /* args check */
    170     if(U_FAILURE(*pErrorCode)){
    171         return NULL;
    172     }
    173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    176     ) {
    177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    178         return NULL;
    179     }
    180 
    181     if(pNumSubstitutions != NULL) {
    182         *pNumSubstitutions = 0;
    183     }
    184 
    185     pDest = dest;
    186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
    187     reqLength = 0;
    188     numSubstitutions = 0;
    189 
    190     if(srcLength < 0) {
    191         /* simple loop for conversion of a NUL-terminated BMP string */
    192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    193             ++src;
    194             if(pDest < destLimit) {
    195                 *pDest++ = ch;
    196             } else {
    197                 ++reqLength;
    198             }
    199         }
    200         srcLimit = src;
    201         if(ch != 0) {
    202             /* "complicated" case, find the end of the remaining string */
    203             while(*++srcLimit != 0) {}
    204         }
    205     } else {
    206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
    207     }
    208 
    209     /* convert with length */
    210     while(src < srcLimit) {
    211         ch = *src++;
    212         if(!U16_IS_SURROGATE(ch)) {
    213             /* write or count ch below */
    214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    215             ++src;
    216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    217         } else if((ch = subchar) < 0) {
    218             /* unpaired surrogate */
    219             *pErrorCode = U_INVALID_CHAR_FOUND;
    220             return NULL;
    221         } else {
    222             ++numSubstitutions;
    223         }
    224         if(pDest < destLimit) {
    225             *pDest++ = ch;
    226         } else {
    227             ++reqLength;
    228         }
    229     }
    230 
    231     reqLength += (int32_t)(pDest - dest);
    232     if(pDestLength) {
    233         *pDestLength = reqLength;
    234     }
    235     if(pNumSubstitutions != NULL) {
    236         *pNumSubstitutions = numSubstitutions;
    237     }
    238 
    239     /* Terminate the buffer */
    240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    241 
    242     return dest;
    243 }
    244 
    245 U_CAPI UChar32* U_EXPORT2
    246 u_strToUTF32(UChar32 *dest,
    247              int32_t destCapacity,
    248              int32_t *pDestLength,
    249              const UChar *src,
    250              int32_t srcLength,
    251              UErrorCode *pErrorCode) {
    252     return u_strToUTF32WithSub(
    253             dest, destCapacity, pDestLength,
    254             src, srcLength,
    255             U_SENTINEL, NULL,
    256             pErrorCode);
    257 }
    258 
    259 /* for utf8_nextCharSafeBodyTerminated() */
    260 static const UChar32
    261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
    262 
    263 /*
    264  * Version of utf8_nextCharSafeBody() with the following differences:
    265  * - checks for NUL termination instead of length
    266  * - works with pointers instead of indexes
    267  * - always strict (strict==-1)
    268  *
    269  * *ps points to after the lead byte and will be moved to after the last trail byte.
    270  * c is the lead byte.
    271  * @return the code point, or U_SENTINEL
    272  */
    273 static UChar32
    274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
    275     const uint8_t *s=*ps;
    276     uint8_t trail, illegal=0;
    277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    278     U_ASSERT(count<6);
    279     U8_MASK_LEAD_BYTE((c), count);
    280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    281     switch(count) {
    282     /* each branch falls through to the next one */
    283     case 5:
    284     case 4:
    285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    286         illegal=1;
    287         break;
    288     case 3:
    289         trail=(uint8_t)(*s++ - 0x80);
    290         c=(c<<6)|trail;
    291         if(trail>0x3f || c>=0x110) {
    292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
    293             illegal=1;
    294             break;
    295         }
    296     case 2: /*fall through*/
    297         trail=(uint8_t)(*s++ - 0x80);
    298         if(trail>0x3f) {
    299             /* not a trail byte */
    300             illegal=1;
    301             break;
    302         }
    303         c=(c<<6)|trail;
    304     case 1: /*fall through*/
    305         trail=(uint8_t)(*s++ - 0x80);
    306         if(trail>0x3f) {
    307             /* not a trail byte */
    308             illegal=1;
    309         }
    310         c=(c<<6)|trail;
    311         break;
    312     case 0:
    313         return U_SENTINEL;
    314     /* no default branch to optimize switch()  - all values are covered */
    315     }
    316 
    317     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    318     /* illegal is also set if count>=4 */
    319     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    320         /* error handling */
    321         /* don't go beyond this sequence */
    322         s=*ps;
    323         while(count>0 && U8_IS_TRAIL(*s)) {
    324             ++s;
    325             --count;
    326         }
    327         c=U_SENTINEL;
    328     }
    329     *ps=s;
    330     return c;
    331 }
    332 
    333 /*
    334  * Version of utf8_nextCharSafeBody() with the following differences:
    335  * - works with pointers instead of indexes
    336  * - always strict (strict==-1)
    337  *
    338  * *ps points to after the lead byte and will be moved to after the last trail byte.
    339  * c is the lead byte.
    340  * @return the code point, or U_SENTINEL
    341  */
    342 static UChar32
    343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
    344     const uint8_t *s=*ps;
    345     uint8_t trail, illegal=0;
    346     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    347     if((limit-s)>=count) {
    348         U8_MASK_LEAD_BYTE((c), count);
    349         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    350         switch(count) {
    351         /* each branch falls through to the next one */
    352         case 5:
    353         case 4:
    354             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    355             illegal=1;
    356             break;
    357         case 3:
    358             trail=*s++;
    359             c=(c<<6)|(trail&0x3f);
    360             if(c<0x110) {
    361                 illegal|=(trail&0xc0)^0x80;
    362             } else {
    363                 /* code point>0x10ffff, outside Unicode */
    364                 illegal=1;
    365                 break;
    366             }
    367         case 2: /*fall through*/
    368             trail=*s++;
    369             c=(c<<6)|(trail&0x3f);
    370             illegal|=(trail&0xc0)^0x80;
    371         case 1: /*fall through*/
    372             trail=*s++;
    373             c=(c<<6)|(trail&0x3f);
    374             illegal|=(trail&0xc0)^0x80;
    375             break;
    376         case 0:
    377             return U_SENTINEL;
    378         /* no default branch to optimize switch()  - all values are covered */
    379         }
    380     } else {
    381         illegal=1; /* too few bytes left */
    382     }
    383 
    384     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    385     /* illegal is also set if count>=4 */
    386     U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal));
    387     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    388         /* error handling */
    389         /* don't go beyond this sequence */
    390         s=*ps;
    391         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
    392             ++s;
    393             --count;
    394         }
    395         c=U_SENTINEL;
    396     }
    397     *ps=s;
    398     return c;
    399 }
    400 
    401 U_CAPI UChar* U_EXPORT2
    402 u_strFromUTF8WithSub(UChar *dest,
    403               int32_t destCapacity,
    404               int32_t *pDestLength,
    405               const char* src,
    406               int32_t srcLength,
    407               UChar32 subchar, int32_t *pNumSubstitutions,
    408               UErrorCode *pErrorCode){
    409     UChar *pDest = dest;
    410     UChar *pDestLimit = dest+destCapacity;
    411     UChar32 ch;
    412     int32_t reqLength = 0;
    413     const uint8_t* pSrc = (const uint8_t*) src;
    414     uint8_t t1, t2; /* trail bytes */
    415     int32_t numSubstitutions;
    416 
    417     /* args check */
    418     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    419         return NULL;
    420     }
    421 
    422     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    423         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    424         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    425     ) {
    426         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    427         return NULL;
    428     }
    429 
    430     if(pNumSubstitutions!=NULL) {
    431         *pNumSubstitutions=0;
    432     }
    433     numSubstitutions=0;
    434 
    435     /*
    436      * Inline processing of UTF-8 byte sequences:
    437      *
    438      * Byte sequences for the most common characters are handled inline in
    439      * the conversion loops. In order to reduce the path lengths for those
    440      * characters, the tests are arranged in a kind of binary search.
    441      * ASCII (<=0x7f) is checked first, followed by the dividing point
    442      * between 2- and 3-byte sequences (0xe0).
    443      * The 3-byte branch is tested first to speed up CJK text.
    444      * The compiler should combine the subtractions for the two tests for 0xe0.
    445      * Each branch then tests for the other end of its range.
    446      */
    447 
    448     if(srcLength < 0){
    449         /*
    450          * Transform a NUL-terminated string.
    451          * The code explicitly checks for NULs only in the lead byte position.
    452          * A NUL byte in the trail byte position fails the trail byte range check anyway.
    453          */
    454         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    455             if(ch <= 0x7f){
    456                 *pDest++=(UChar)ch;
    457                 ++pSrc;
    458             } else {
    459                 if(ch > 0xe0) {
    460                     if( /* handle U+1000..U+CFFF inline */
    461                         ch <= 0xec &&
    462                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    463                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    464                     ) {
    465                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    466                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    467                         pSrc += 3;
    468                         continue;
    469                     }
    470                 } else if(ch < 0xe0) {
    471                     if( /* handle U+0080..U+07FF inline */
    472                         ch >= 0xc2 &&
    473                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    474                     ) {
    475                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    476                         pSrc += 2;
    477                         continue;
    478                     }
    479                 }
    480 
    481                 /* function call for "complicated" and error cases */
    482                 ++pSrc; /* continue after the lead byte */
    483                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    484                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    485                     *pErrorCode = U_INVALID_CHAR_FOUND;
    486                     return NULL;
    487                 } else if(ch<=0xFFFF) {
    488                     *(pDest++)=(UChar)ch;
    489                 } else {
    490                     *(pDest++)=U16_LEAD(ch);
    491                     if(pDest<pDestLimit) {
    492                         *(pDest++)=U16_TRAIL(ch);
    493                     } else {
    494                         reqLength++;
    495                         break;
    496                     }
    497                 }
    498             }
    499         }
    500 
    501         /* Pre-flight the rest of the string. */
    502         while((ch = *pSrc) != 0) {
    503             if(ch <= 0x7f){
    504                 ++reqLength;
    505                 ++pSrc;
    506             } else {
    507                 if(ch > 0xe0) {
    508                     if( /* handle U+1000..U+CFFF inline */
    509                         ch <= 0xec &&
    510                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    511                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    512                     ) {
    513                         ++reqLength;
    514                         pSrc += 3;
    515                         continue;
    516                     }
    517                 } else if(ch < 0xe0) {
    518                     if( /* handle U+0080..U+07FF inline */
    519                         ch >= 0xc2 &&
    520                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    521                     ) {
    522                         ++reqLength;
    523                         pSrc += 2;
    524                         continue;
    525                     }
    526                 }
    527 
    528                 /* function call for "complicated" and error cases */
    529                 ++pSrc; /* continue after the lead byte */
    530                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    531                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    532                     *pErrorCode = U_INVALID_CHAR_FOUND;
    533                     return NULL;
    534                 }
    535                 reqLength += U16_LENGTH(ch);
    536             }
    537         }
    538     } else /* srcLength >= 0 */ {
    539         const uint8_t *pSrcLimit = pSrc + srcLength;
    540         int32_t count;
    541 
    542         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    543         for(;;) {
    544             /*
    545              * Each iteration of the inner loop progresses by at most 3 UTF-8
    546              * bytes and one UChar, for most characters.
    547              * For supplementary code points (4 & 2), which are rare,
    548              * there is an additional adjustment.
    549              */
    550             count = (int32_t)(pDestLimit - pDest);
    551             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
    552             if(count > srcLength) {
    553                 count = srcLength; /* min(remaining dest, remaining src/3) */
    554             }
    555             if(count < 3) {
    556                 /*
    557                  * Too much overhead if we get near the end of the string,
    558                  * continue with the next loop.
    559                  */
    560                 break;
    561             }
    562 
    563             do {
    564                 ch = *pSrc;
    565                 if(ch <= 0x7f){
    566                     *pDest++=(UChar)ch;
    567                     ++pSrc;
    568                 } else {
    569                     if(ch > 0xe0) {
    570                         if( /* handle U+1000..U+CFFF inline */
    571                             ch <= 0xec &&
    572                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    573                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    574                         ) {
    575                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    576                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    577                             pSrc += 3;
    578                             continue;
    579                         }
    580                     } else if(ch < 0xe0) {
    581                         if( /* handle U+0080..U+07FF inline */
    582                             ch >= 0xc2 &&
    583                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    584                         ) {
    585                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    586                             pSrc += 2;
    587                             continue;
    588                         }
    589                     }
    590 
    591                     if(ch >= 0xf0 || subchar > 0xffff) {
    592                         /*
    593                          * We may read up to six bytes and write up to two UChars,
    594                          * which we didn't account for with computing count,
    595                          * so we adjust it here.
    596                          */
    597                         if(--count == 0) {
    598                             break;
    599                         }
    600                     }
    601 
    602                     /* function call for "complicated" and error cases */
    603                     ++pSrc; /* continue after the lead byte */
    604                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    605                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    606                         *pErrorCode = U_INVALID_CHAR_FOUND;
    607                         return NULL;
    608                     }else if(ch<=0xFFFF){
    609                         *(pDest++)=(UChar)ch;
    610                     }else{
    611                         *(pDest++)=U16_LEAD(ch);
    612                         *(pDest++)=U16_TRAIL(ch);
    613                     }
    614                 }
    615             } while(--count > 0);
    616         }
    617 
    618         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
    619             ch = *pSrc;
    620             if(ch <= 0x7f){
    621                 *pDest++=(UChar)ch;
    622                 ++pSrc;
    623             } else {
    624                 if(ch > 0xe0) {
    625                     if( /* handle U+1000..U+CFFF inline */
    626                         ch <= 0xec &&
    627                         ((pSrcLimit - pSrc) >= 3) &&
    628                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    629                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    630                     ) {
    631                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    632                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    633                         pSrc += 3;
    634                         continue;
    635                     }
    636                 } else if(ch < 0xe0) {
    637                     if( /* handle U+0080..U+07FF inline */
    638                         ch >= 0xc2 &&
    639                         ((pSrcLimit - pSrc) >= 2) &&
    640                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    641                     ) {
    642                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    643                         pSrc += 2;
    644                         continue;
    645                     }
    646                 }
    647 
    648                 /* function call for "complicated" and error cases */
    649                 ++pSrc; /* continue after the lead byte */
    650                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    651                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    652                     *pErrorCode = U_INVALID_CHAR_FOUND;
    653                     return NULL;
    654                 }else if(ch<=0xFFFF){
    655                     *(pDest++)=(UChar)ch;
    656                 }else{
    657                     *(pDest++)=U16_LEAD(ch);
    658                     if(pDest<pDestLimit){
    659                         *(pDest++)=U16_TRAIL(ch);
    660                     }else{
    661                         reqLength++;
    662                         break;
    663                     }
    664                 }
    665             }
    666         }
    667         /* do not fill the dest buffer just count the UChars needed */
    668         while(pSrc < pSrcLimit){
    669             ch = *pSrc;
    670             if(ch <= 0x7f){
    671                 reqLength++;
    672                 ++pSrc;
    673             } else {
    674                 if(ch > 0xe0) {
    675                     if( /* handle U+1000..U+CFFF inline */
    676                         ch <= 0xec &&
    677                         ((pSrcLimit - pSrc) >= 3) &&
    678                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    679                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    680                     ) {
    681                         reqLength++;
    682                         pSrc += 3;
    683                         continue;
    684                     }
    685                 } else if(ch < 0xe0) {
    686                     if( /* handle U+0080..U+07FF inline */
    687                         ch >= 0xc2 &&
    688                         ((pSrcLimit - pSrc) >= 2) &&
    689                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    690                     ) {
    691                         reqLength++;
    692                         pSrc += 2;
    693                         continue;
    694                     }
    695                 }
    696 
    697                 /* function call for "complicated" and error cases */
    698                 ++pSrc; /* continue after the lead byte */
    699                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    700                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    701                     *pErrorCode = U_INVALID_CHAR_FOUND;
    702                     return NULL;
    703                 }
    704                 reqLength+=U16_LENGTH(ch);
    705             }
    706         }
    707     }
    708 
    709     reqLength+=(int32_t)(pDest - dest);
    710 
    711     if(pNumSubstitutions!=NULL) {
    712         *pNumSubstitutions=numSubstitutions;
    713     }
    714 
    715     if(pDestLength){
    716         *pDestLength = reqLength;
    717     }
    718 
    719     /* Terminate the buffer */
    720     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    721 
    722     return dest;
    723 }
    724 
    725 U_CAPI UChar* U_EXPORT2
    726 u_strFromUTF8(UChar *dest,
    727               int32_t destCapacity,
    728               int32_t *pDestLength,
    729               const char* src,
    730               int32_t srcLength,
    731               UErrorCode *pErrorCode){
    732     return u_strFromUTF8WithSub(
    733             dest, destCapacity, pDestLength,
    734             src, srcLength,
    735             U_SENTINEL, NULL,
    736             pErrorCode);
    737 }
    738 
    739 U_CAPI UChar * U_EXPORT2
    740 u_strFromUTF8Lenient(UChar *dest,
    741                      int32_t destCapacity,
    742                      int32_t *pDestLength,
    743                      const char *src,
    744                      int32_t srcLength,
    745                      UErrorCode *pErrorCode) {
    746     UChar *pDest = dest;
    747     UChar32 ch;
    748     int32_t reqLength = 0;
    749     uint8_t* pSrc = (uint8_t*) src;
    750 
    751     /* args check */
    752     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    753         return NULL;
    754     }
    755 
    756     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    757         (destCapacity<0) || (dest == NULL && destCapacity > 0)
    758     ) {
    759         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    760         return NULL;
    761     }
    762 
    763     if(srcLength < 0) {
    764         /* Transform a NUL-terminated string. */
    765         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
    766         uint8_t t1, t2, t3; /* trail bytes */
    767 
    768         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    769             if(ch < 0xc0) {
    770                 /*
    771                  * ASCII, or a trail byte in lead position which is treated like
    772                  * a single-byte sequence for better character boundary
    773                  * resynchronization after illegal sequences.
    774                  */
    775                 *pDest++=(UChar)ch;
    776                 ++pSrc;
    777                 continue;
    778             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    779                 if((t1 = pSrc[1]) != 0) {
    780                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    781                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
    782                     pSrc += 2;
    783                     continue;
    784                 }
    785             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    787                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    788                     /* 0x2080 = (0x80 << 6) + 0x80 */
    789                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    790                     pSrc += 3;
    791                     continue;
    792                 }
    793             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    794                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    795                     pSrc += 4;
    796                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    797                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    798                     *(pDest++) = U16_LEAD(ch);
    799                     if(pDest < pDestLimit) {
    800                         *(pDest++) = U16_TRAIL(ch);
    801                     } else {
    802                         reqLength = 1;
    803                         break;
    804                     }
    805                     continue;
    806                 }
    807             }
    808 
    809             /* truncated character at the end */
    810             *pDest++ = 0xfffd;
    811             while(*++pSrc != 0) {}
    812             break;
    813         }
    814 
    815         /* Pre-flight the rest of the string. */
    816         while((ch = *pSrc) != 0) {
    817             if(ch < 0xc0) {
    818                 /*
    819                  * ASCII, or a trail byte in lead position which is treated like
    820                  * a single-byte sequence for better character boundary
    821                  * resynchronization after illegal sequences.
    822                  */
    823                 ++reqLength;
    824                 ++pSrc;
    825                 continue;
    826             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    827                 if(pSrc[1] != 0) {
    828                     ++reqLength;
    829                     pSrc += 2;
    830                     continue;
    831                 }
    832             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    833                 if(pSrc[1] != 0 && pSrc[2] != 0) {
    834                     ++reqLength;
    835                     pSrc += 3;
    836                     continue;
    837                 }
    838             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    839                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    840                     reqLength += 2;
    841                     pSrc += 4;
    842                     continue;
    843                 }
    844             }
    845 
    846             /* truncated character at the end */
    847             ++reqLength;
    848             break;
    849         }
    850     } else /* srcLength >= 0 */ {
    851       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
    852 
    853         /*
    854          * This function requires that if srcLength is given, then it must be
    855          * destCapatity >= srcLength so that we need not check for
    856          * destination buffer overflow in the loop.
    857          */
    858         if(destCapacity < srcLength) {
    859             if(pDestLength != NULL) {
    860                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    861             }
    862             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    863             return NULL;
    864         }
    865 
    866         if((pSrcLimit - pSrc) >= 4) {
    867             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    868 
    869             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    870             do {
    871                 ch = *pSrc++;
    872                 if(ch < 0xc0) {
    873                     /*
    874                      * ASCII, or a trail byte in lead position which is treated like
    875                      * a single-byte sequence for better character boundary
    876                      * resynchronization after illegal sequences.
    877                      */
    878                     *pDest++=(UChar)ch;
    879                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
    880                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    881                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    882                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    883                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    884                     /* 0x2080 = (0x80 << 6) + 0x80 */
    885                     ch = (ch << 12) + (*pSrc++ << 6);
    886                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    887                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    888                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    889                     ch = (ch << 18) + (*pSrc++ << 12);
    890                     ch += *pSrc++ << 6;
    891                     ch += *pSrc++ - 0x3c82080;
    892                     *(pDest++) = U16_LEAD(ch);
    893                     *(pDest++) = U16_TRAIL(ch);
    894                 }
    895             } while(pSrc < pSrcLimit);
    896 
    897             pSrcLimit += 3; /* restore original pSrcLimit */
    898         }
    899 
    900         while(pSrc < pSrcLimit) {
    901             ch = *pSrc++;
    902             if(ch < 0xc0) {
    903                 /*
    904                  * ASCII, or a trail byte in lead position which is treated like
    905                  * a single-byte sequence for better character boundary
    906                  * resynchronization after illegal sequences.
    907                  */
    908                 *pDest++=(UChar)ch;
    909                 continue;
    910             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    911                 if(pSrc < pSrcLimit) {
    912                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    913                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    914                     continue;
    915                 }
    916             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    917                 if((pSrcLimit - pSrc) >= 2) {
    918                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    919                     /* 0x2080 = (0x80 << 6) + 0x80 */
    920                     ch = (ch << 12) + (*pSrc++ << 6);
    921                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    922                     pSrc += 3;
    923                     continue;
    924                 }
    925             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    926                 if((pSrcLimit - pSrc) >= 3) {
    927                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    928                     ch = (ch << 18) + (*pSrc++ << 12);
    929                     ch += *pSrc++ << 6;
    930                     ch += *pSrc++ - 0x3c82080;
    931                     *(pDest++) = U16_LEAD(ch);
    932                     *(pDest++) = U16_TRAIL(ch);
    933                     pSrc += 4;
    934                     continue;
    935                 }
    936             }
    937 
    938             /* truncated character at the end */
    939             *pDest++ = 0xfffd;
    940             break;
    941         }
    942     }
    943 
    944     reqLength+=(int32_t)(pDest - dest);
    945 
    946     if(pDestLength){
    947         *pDestLength = reqLength;
    948     }
    949 
    950     /* Terminate the buffer */
    951     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    952 
    953     return dest;
    954 }
    955 
    956 static inline uint8_t *
    957 _appendUTF8(uint8_t *pDest, UChar32 c) {
    958     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    959     if((c)<=0x7f) {
    960         *pDest++=(uint8_t)c;
    961     } else if(c<=0x7ff) {
    962         *pDest++=(uint8_t)((c>>6)|0xc0);
    963         *pDest++=(uint8_t)((c&0x3f)|0x80);
    964     } else if(c<=0xffff) {
    965         *pDest++=(uint8_t)((c>>12)|0xe0);
    966         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
    967         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    968     } else /* if((uint32_t)(c)<=0x10ffff) */ {
    969         *pDest++=(uint8_t)(((c)>>18)|0xf0);
    970         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
    971         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
    972         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    973     }
    974     return pDest;
    975 }
    976 
    977 
    978 U_CAPI char* U_EXPORT2
    979 u_strToUTF8WithSub(char *dest,
    980             int32_t destCapacity,
    981             int32_t *pDestLength,
    982             const UChar *pSrc,
    983             int32_t srcLength,
    984             UChar32 subchar, int32_t *pNumSubstitutions,
    985             UErrorCode *pErrorCode){
    986     int32_t reqLength=0;
    987     uint32_t ch=0,ch2=0;
    988     uint8_t *pDest = (uint8_t *)dest;
    989     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    990     int32_t numSubstitutions;
    991 
    992     /* args check */
    993     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    994         return NULL;
    995     }
    996 
    997     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
    998         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    999         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1000     ) {
   1001         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1002         return NULL;
   1003     }
   1004 
   1005     if(pNumSubstitutions!=NULL) {
   1006         *pNumSubstitutions=0;
   1007     }
   1008     numSubstitutions=0;
   1009 
   1010     if(srcLength==-1) {
   1011         while((ch=*pSrc)!=0) {
   1012             ++pSrc;
   1013             if(ch <= 0x7f) {
   1014                 if(pDest<pDestLimit) {
   1015                     *pDest++ = (uint8_t)ch;
   1016                 } else {
   1017                     reqLength = 1;
   1018                     break;
   1019                 }
   1020             } else if(ch <= 0x7ff) {
   1021                 if((pDestLimit - pDest) >= 2) {
   1022                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1023                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1024                 } else {
   1025                     reqLength = 2;
   1026                     break;
   1027                 }
   1028             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1029                 if((pDestLimit - pDest) >= 3) {
   1030                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1031                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1032                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1033                 } else {
   1034                     reqLength = 3;
   1035                     break;
   1036                 }
   1037             } else /* ch is a surrogate */ {
   1038                 int32_t length;
   1039 
   1040                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
   1041                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1042                     ++pSrc;
   1043                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1044                 } else if(subchar>=0) {
   1045                     ch=subchar;
   1046                     ++numSubstitutions;
   1047                 } else {
   1048                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1049                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1050                     return NULL;
   1051                 }
   1052 
   1053                 length = U8_LENGTH(ch);
   1054                 if((pDestLimit - pDest) >= length) {
   1055                     /* convert and append*/
   1056                     pDest=_appendUTF8(pDest, ch);
   1057                 } else {
   1058                     reqLength = length;
   1059                     break;
   1060                 }
   1061             }
   1062         }
   1063         while((ch=*pSrc++)!=0) {
   1064             if(ch<=0x7f) {
   1065                 ++reqLength;
   1066             } else if(ch<=0x7ff) {
   1067                 reqLength+=2;
   1068             } else if(!U16_IS_SURROGATE(ch)) {
   1069                 reqLength+=3;
   1070             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1071                 ++pSrc;
   1072                 reqLength+=4;
   1073             } else if(subchar>=0) {
   1074                 reqLength+=U8_LENGTH(subchar);
   1075                 ++numSubstitutions;
   1076             } else {
   1077                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1078                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1079                 return NULL;
   1080             }
   1081         }
   1082     } else {
   1083         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
   1084         int32_t count;
   1085 
   1086         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1087         for(;;) {
   1088             /*
   1089              * Each iteration of the inner loop progresses by at most 3 UTF-8
   1090              * bytes and one UChar, for most characters.
   1091              * For supplementary code points (4 & 2), which are rare,
   1092              * there is an additional adjustment.
   1093              */
   1094             count = (int32_t)((pDestLimit - pDest) / 3);
   1095             srcLength = (int32_t)(pSrcLimit - pSrc);
   1096             if(count > srcLength) {
   1097                 count = srcLength; /* min(remaining dest/3, remaining src) */
   1098             }
   1099             if(count < 3) {
   1100                 /*
   1101                  * Too much overhead if we get near the end of the string,
   1102                  * continue with the next loop.
   1103                  */
   1104                 break;
   1105             }
   1106             do {
   1107                 ch=*pSrc++;
   1108                 if(ch <= 0x7f) {
   1109                     *pDest++ = (uint8_t)ch;
   1110                 } else if(ch <= 0x7ff) {
   1111                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1112                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1113                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1114                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1115                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1116                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1117                 } else /* ch is a surrogate */ {
   1118                     /*
   1119                      * We will read two UChars and probably output four bytes,
   1120                      * which we didn't account for with computing count,
   1121                      * so we adjust it here.
   1122                      */
   1123                     if(--count == 0) {
   1124                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
   1125                         break;  /* recompute count */
   1126                     }
   1127 
   1128                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
   1129                         ++pSrc;
   1130                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1131 
   1132                         /* writing 4 bytes per 2 UChars is ok */
   1133                         *pDest++=(uint8_t)((ch>>18)|0xf0);
   1134                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
   1135                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1136                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1137                     } else  {
   1138                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1139                         if(subchar>=0) {
   1140                             ch=subchar;
   1141                             ++numSubstitutions;
   1142                         } else {
   1143                             *pErrorCode = U_INVALID_CHAR_FOUND;
   1144                             return NULL;
   1145                         }
   1146 
   1147                         /* convert and append*/
   1148                         pDest=_appendUTF8(pDest, ch);
   1149                     }
   1150                 }
   1151             } while(--count > 0);
   1152         }
   1153 
   1154         while(pSrc<pSrcLimit) {
   1155             ch=*pSrc++;
   1156             if(ch <= 0x7f) {
   1157                 if(pDest<pDestLimit) {
   1158                     *pDest++ = (uint8_t)ch;
   1159                 } else {
   1160                     reqLength = 1;
   1161                     break;
   1162                 }
   1163             } else if(ch <= 0x7ff) {
   1164                 if((pDestLimit - pDest) >= 2) {
   1165                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1166                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1167                 } else {
   1168                     reqLength = 2;
   1169                     break;
   1170                 }
   1171             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1172                 if((pDestLimit - pDest) >= 3) {
   1173                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1174                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1175                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1176                 } else {
   1177                     reqLength = 3;
   1178                     break;
   1179                 }
   1180             } else /* ch is a surrogate */ {
   1181                 int32_t length;
   1182 
   1183                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1184                     ++pSrc;
   1185                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1186                 } else if(subchar>=0) {
   1187                     ch=subchar;
   1188                     ++numSubstitutions;
   1189                 } else {
   1190                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1191                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1192                     return NULL;
   1193                 }
   1194 
   1195                 length = U8_LENGTH(ch);
   1196                 if((pDestLimit - pDest) >= length) {
   1197                     /* convert and append*/
   1198                     pDest=_appendUTF8(pDest, ch);
   1199                 } else {
   1200                     reqLength = length;
   1201                     break;
   1202                 }
   1203             }
   1204         }
   1205         while(pSrc<pSrcLimit) {
   1206             ch=*pSrc++;
   1207             if(ch<=0x7f) {
   1208                 ++reqLength;
   1209             } else if(ch<=0x7ff) {
   1210                 reqLength+=2;
   1211             } else if(!U16_IS_SURROGATE(ch)) {
   1212                 reqLength+=3;
   1213             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1214                 ++pSrc;
   1215                 reqLength+=4;
   1216             } else if(subchar>=0) {
   1217                 reqLength+=U8_LENGTH(subchar);
   1218                 ++numSubstitutions;
   1219             } else {
   1220                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1221                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1222                 return NULL;
   1223             }
   1224         }
   1225     }
   1226 
   1227     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1228 
   1229     if(pNumSubstitutions!=NULL) {
   1230         *pNumSubstitutions=numSubstitutions;
   1231     }
   1232 
   1233     if(pDestLength){
   1234         *pDestLength = reqLength;
   1235     }
   1236 
   1237     /* Terminate the buffer */
   1238     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1239     return dest;
   1240 }
   1241 
   1242 U_CAPI char* U_EXPORT2
   1243 u_strToUTF8(char *dest,
   1244             int32_t destCapacity,
   1245             int32_t *pDestLength,
   1246             const UChar *pSrc,
   1247             int32_t srcLength,
   1248             UErrorCode *pErrorCode){
   1249     return u_strToUTF8WithSub(
   1250             dest, destCapacity, pDestLength,
   1251             pSrc, srcLength,
   1252             U_SENTINEL, NULL,
   1253             pErrorCode);
   1254 }
   1255 
   1256 U_CAPI UChar* U_EXPORT2
   1257 u_strFromJavaModifiedUTF8WithSub(
   1258         UChar *dest,
   1259         int32_t destCapacity,
   1260         int32_t *pDestLength,
   1261         const char *src,
   1262         int32_t srcLength,
   1263         UChar32 subchar, int32_t *pNumSubstitutions,
   1264         UErrorCode *pErrorCode) {
   1265     UChar *pDest = dest;
   1266     UChar *pDestLimit = dest+destCapacity;
   1267     UChar32 ch;
   1268     int32_t reqLength = 0;
   1269     const uint8_t* pSrc = (const uint8_t*) src;
   1270     const uint8_t *pSrcLimit;
   1271     int32_t count;
   1272     uint8_t t1, t2; /* trail bytes */
   1273     int32_t numSubstitutions;
   1274 
   1275     /* args check */
   1276     if(U_FAILURE(*pErrorCode)){
   1277         return NULL;
   1278     }
   1279     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1280         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
   1281         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1282     ) {
   1283         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1284         return NULL;
   1285     }
   1286 
   1287     if(pNumSubstitutions!=NULL) {
   1288         *pNumSubstitutions=0;
   1289     }
   1290     numSubstitutions=0;
   1291 
   1292     if(srcLength < 0) {
   1293         /*
   1294          * Transform a NUL-terminated ASCII string.
   1295          * Handle non-ASCII strings with slower code.
   1296          */
   1297         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
   1298             *pDest++=(UChar)ch;
   1299             ++pSrc;
   1300         }
   1301         if(ch == 0) {
   1302             reqLength=(int32_t)(pDest - dest);
   1303             if(pDestLength) {
   1304                 *pDestLength = reqLength;
   1305             }
   1306 
   1307             /* Terminate the buffer */
   1308             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1309             return dest;
   1310         }
   1311         srcLength = uprv_strlen((const char *)pSrc);
   1312     }
   1313 
   1314     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1315     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
   1316     for(;;) {
   1317         count = (int32_t)(pDestLimit - pDest);
   1318         srcLength = (int32_t)(pSrcLimit - pSrc);
   1319         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
   1320             /* fast ASCII loop */
   1321             const uint8_t *prevSrc = pSrc;
   1322             int32_t delta;
   1323             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
   1324                 *pDest++=(UChar)ch;
   1325                 ++pSrc;
   1326             }
   1327             delta = (int32_t)(pSrc - prevSrc);
   1328             count -= delta;
   1329             srcLength -= delta;
   1330         }
   1331         /*
   1332          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1333          * bytes and one UChar.
   1334          */
   1335         srcLength /= 3;
   1336         if(count > srcLength) {
   1337             count = srcLength; /* min(remaining dest, remaining src/3) */
   1338         }
   1339         if(count < 3) {
   1340             /*
   1341              * Too much overhead if we get near the end of the string,
   1342              * continue with the next loop.
   1343              */
   1344             break;
   1345         }
   1346         do {
   1347             ch = *pSrc;
   1348             if(ch <= 0x7f){
   1349                 *pDest++=(UChar)ch;
   1350                 ++pSrc;
   1351             } else {
   1352                 if(ch >= 0xe0) {
   1353                     if( /* handle U+0000..U+FFFF inline */
   1354                         ch <= 0xef &&
   1355                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1356                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1357                     ) {
   1358                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1359                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1360                         pSrc += 3;
   1361                         continue;
   1362                     }
   1363                 } else {
   1364                     if( /* handle U+0000..U+07FF inline */
   1365                         ch >= 0xc0 &&
   1366                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1367                     ) {
   1368                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1369                         pSrc += 2;
   1370                         continue;
   1371                     }
   1372                 }
   1373 
   1374                 if(subchar < 0) {
   1375                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1376                     return NULL;
   1377                 } else if(subchar > 0xffff && --count == 0) {
   1378                     /*
   1379                      * We need to write two UChars, adjusted count for that,
   1380                      * and ran out of space.
   1381                      */
   1382                     break;
   1383                 } else {
   1384                     /* function call for error cases */
   1385                     ++pSrc; /* continue after the lead byte */
   1386                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1387                     ++numSubstitutions;
   1388                     if(subchar<=0xFFFF) {
   1389                         *(pDest++)=(UChar)subchar;
   1390                     } else {
   1391                         *(pDest++)=U16_LEAD(subchar);
   1392                         *(pDest++)=U16_TRAIL(subchar);
   1393                     }
   1394                 }
   1395             }
   1396         } while(--count > 0);
   1397     }
   1398 
   1399     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
   1400         ch = *pSrc;
   1401         if(ch <= 0x7f){
   1402             *pDest++=(UChar)ch;
   1403             ++pSrc;
   1404         } else {
   1405             if(ch >= 0xe0) {
   1406                 if( /* handle U+0000..U+FFFF inline */
   1407                     ch <= 0xef &&
   1408                     ((pSrcLimit - pSrc) >= 3) &&
   1409                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1410                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1411                 ) {
   1412                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1413                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1414                     pSrc += 3;
   1415                     continue;
   1416                 }
   1417             } else {
   1418                 if( /* handle U+0000..U+07FF inline */
   1419                     ch >= 0xc0 &&
   1420                     ((pSrcLimit - pSrc) >= 2) &&
   1421                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1422                 ) {
   1423                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1424                     pSrc += 2;
   1425                     continue;
   1426                 }
   1427             }
   1428 
   1429             if(subchar < 0) {
   1430                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1431                 return NULL;
   1432             } else {
   1433                 /* function call for error cases */
   1434                 ++pSrc; /* continue after the lead byte */
   1435                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1436                 ++numSubstitutions;
   1437                 if(subchar<=0xFFFF) {
   1438                     *(pDest++)=(UChar)subchar;
   1439                 } else {
   1440                     *(pDest++)=U16_LEAD(subchar);
   1441                     if(pDest<pDestLimit) {
   1442                         *(pDest++)=U16_TRAIL(subchar);
   1443                     } else {
   1444                         reqLength++;
   1445                         break;
   1446                     }
   1447                 }
   1448             }
   1449         }
   1450     }
   1451 
   1452     /* do not fill the dest buffer just count the UChars needed */
   1453     while(pSrc < pSrcLimit){
   1454         ch = *pSrc;
   1455         if(ch <= 0x7f) {
   1456             reqLength++;
   1457             ++pSrc;
   1458         } else {
   1459             if(ch >= 0xe0) {
   1460                 if( /* handle U+0000..U+FFFF inline */
   1461                     ch <= 0xef &&
   1462                     ((pSrcLimit - pSrc) >= 3) &&
   1463                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   1464                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   1465                 ) {
   1466                     reqLength++;
   1467                     pSrc += 3;
   1468                     continue;
   1469                 }
   1470             } else {
   1471                 if( /* handle U+0000..U+07FF inline */
   1472                     ch >= 0xc0 &&
   1473                     ((pSrcLimit - pSrc) >= 2) &&
   1474                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   1475                 ) {
   1476                     reqLength++;
   1477                     pSrc += 2;
   1478                     continue;
   1479                 }
   1480             }
   1481 
   1482             if(subchar < 0) {
   1483                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1484                 return NULL;
   1485             } else {
   1486                 /* function call for error cases */
   1487                 ++pSrc; /* continue after the lead byte */
   1488                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1489                 ++numSubstitutions;
   1490                 reqLength+=U16_LENGTH(ch);
   1491             }
   1492         }
   1493     }
   1494 
   1495     if(pNumSubstitutions!=NULL) {
   1496         *pNumSubstitutions=numSubstitutions;
   1497     }
   1498 
   1499     reqLength+=(int32_t)(pDest - dest);
   1500     if(pDestLength) {
   1501         *pDestLength = reqLength;
   1502     }
   1503 
   1504     /* Terminate the buffer */
   1505     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1506     return dest;
   1507 }
   1508 
   1509 U_CAPI char* U_EXPORT2
   1510 u_strToJavaModifiedUTF8(
   1511         char *dest,
   1512         int32_t destCapacity,
   1513         int32_t *pDestLength,
   1514         const UChar *src,
   1515         int32_t srcLength,
   1516         UErrorCode *pErrorCode) {
   1517     int32_t reqLength=0;
   1518     uint32_t ch=0;
   1519     uint8_t *pDest = (uint8_t *)dest;
   1520     uint8_t *pDestLimit = pDest + destCapacity;
   1521     const UChar *pSrcLimit;
   1522     int32_t count;
   1523 
   1524     /* args check */
   1525     if(U_FAILURE(*pErrorCode)){
   1526         return NULL;
   1527     }
   1528     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1529         (dest==NULL && destCapacity!=0) || destCapacity<0
   1530     ) {
   1531         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1532         return NULL;
   1533     }
   1534 
   1535     if(srcLength==-1) {
   1536         /* Convert NUL-terminated ASCII, then find the string length. */
   1537         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1538             *pDest++ = (uint8_t)ch;
   1539             ++src;
   1540         }
   1541         if(ch == 0) {
   1542             reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1543             if(pDestLength) {
   1544                 *pDestLength = reqLength;
   1545             }
   1546 
   1547             /* Terminate the buffer */
   1548             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1549             return dest;
   1550         }
   1551         srcLength = u_strlen(src);
   1552     }
   1553 
   1554     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1555     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
   1556     for(;;) {
   1557         count = (int32_t)(pDestLimit - pDest);
   1558         srcLength = (int32_t)(pSrcLimit - src);
   1559         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1560             /* fast ASCII loop */
   1561             const UChar *prevSrc = src;
   1562             int32_t delta;
   1563             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1564                 *pDest++=(uint8_t)ch;
   1565                 ++src;
   1566             }
   1567             delta = (int32_t)(src - prevSrc);
   1568             count -= delta;
   1569             srcLength -= delta;
   1570         }
   1571         /*
   1572          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1573          * bytes and one UChar.
   1574          */
   1575         count /= 3;
   1576         if(count > srcLength) {
   1577             count = srcLength; /* min(remaining dest/3, remaining src) */
   1578         }
   1579         if(count < 3) {
   1580             /*
   1581              * Too much overhead if we get near the end of the string,
   1582              * continue with the next loop.
   1583              */
   1584             break;
   1585         }
   1586         do {
   1587             ch=*src++;
   1588             if(ch <= 0x7f && ch != 0) {
   1589                 *pDest++ = (uint8_t)ch;
   1590             } else if(ch <= 0x7ff) {
   1591                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1592                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1593             } else {
   1594                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1595                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1597             }
   1598         } while(--count > 0);
   1599     }
   1600 
   1601     while(src<pSrcLimit) {
   1602         ch=*src++;
   1603         if(ch <= 0x7f && ch != 0) {
   1604             if(pDest<pDestLimit) {
   1605                 *pDest++ = (uint8_t)ch;
   1606             } else {
   1607                 reqLength = 1;
   1608                 break;
   1609             }
   1610         } else if(ch <= 0x7ff) {
   1611             if((pDestLimit - pDest) >= 2) {
   1612                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1613                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1614             } else {
   1615                 reqLength = 2;
   1616                 break;
   1617             }
   1618         } else {
   1619             if((pDestLimit - pDest) >= 3) {
   1620                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1621                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1622                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1623             } else {
   1624                 reqLength = 3;
   1625                 break;
   1626             }
   1627         }
   1628     }
   1629     while(src<pSrcLimit) {
   1630         ch=*src++;
   1631         if(ch <= 0x7f && ch != 0) {
   1632             ++reqLength;
   1633         } else if(ch<=0x7ff) {
   1634             reqLength+=2;
   1635         } else {
   1636             reqLength+=3;
   1637         }
   1638     }
   1639 
   1640     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1641     if(pDestLength){
   1642         *pDestLength = reqLength;
   1643     }
   1644 
   1645     /* Terminate the buffer */
   1646     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1647     return dest;
   1648 }
   1649