Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *
     11 * File ustrtrns.cpp
     12 *
     13 * Modification History:
     14 *
     15 *   Date        Name        Description
     16 *   9/10/2001    Ram    Creation.
     17 ******************************************************************************
     18 */
     19 
     20 /*******************************************************************************
     21  *
     22  * u_strTo* and u_strFrom* APIs
     23  * WCS functions moved to ustr_wcs.c for better modularization
     24  *
     25  *******************************************************************************
     26  */
     27 
     28 
     29 #include "unicode/putil.h"
     30 #include "unicode/ustring.h"
     31 #include "unicode/utf.h"
     32 #include "unicode/utf8.h"
     33 #include "unicode/utf16.h"
     34 #include "cstring.h"
     35 #include "cmemory.h"
     36 #include "ustr_imp.h"
     37 #include "uassert.h"
     38 
     39 U_CAPI UChar* U_EXPORT2
     40 u_strFromUTF32WithSub(UChar *dest,
     41                int32_t destCapacity,
     42                int32_t *pDestLength,
     43                const UChar32 *src,
     44                int32_t srcLength,
     45                UChar32 subchar, int32_t *pNumSubstitutions,
     46                UErrorCode *pErrorCode) {
     47     const UChar32 *srcLimit;
     48     UChar32 ch;
     49     UChar *destLimit;
     50     UChar *pDest;
     51     int32_t reqLength;
     52     int32_t numSubstitutions;
     53 
     54     /* args check */
     55     if(U_FAILURE(*pErrorCode)){
     56         return NULL;
     57     }
     58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
     59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
     60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     61     ) {
     62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     63         return NULL;
     64     }
     65 
     66     if(pNumSubstitutions != NULL) {
     67         *pNumSubstitutions = 0;
     68     }
     69 
     70     pDest = dest;
     71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
     72     reqLength = 0;
     73     numSubstitutions = 0;
     74 
     75     if(srcLength < 0) {
     76         /* simple loop for conversion of a NUL-terminated BMP string */
     77         while((ch=*src) != 0 &&
     78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     79             ++src;
     80             if(pDest < destLimit) {
     81                 *pDest++ = (UChar)ch;
     82             } else {
     83                 ++reqLength;
     84             }
     85         }
     86         srcLimit = src;
     87         if(ch != 0) {
     88             /* "complicated" case, find the end of the remaining string */
     89             while(*++srcLimit != 0) {}
     90         }
     91     } else {
     92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
     93     }
     94 
     95     /* convert with length */
     96     while(src < srcLimit) {
     97         ch = *src++;
     98         do {
     99             /* usually "loops" once; twice only for writing subchar */
    100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
    101                 if(pDest < destLimit) {
    102                     *pDest++ = (UChar)ch;
    103                 } else {
    104                     ++reqLength;
    105                 }
    106                 break;
    107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
    108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
    109                     *pDest++ = U16_LEAD(ch);
    110                     *pDest++ = U16_TRAIL(ch);
    111                 } else {
    112                     reqLength += 2;
    113                 }
    114                 break;
    115             } else if((ch = subchar) < 0) {
    116                 /* surrogate code point, or not a Unicode code point at all */
    117                 *pErrorCode = U_INVALID_CHAR_FOUND;
    118                 return NULL;
    119             } else {
    120                 ++numSubstitutions;
    121             }
    122         } while(TRUE);
    123     }
    124 
    125     reqLength += (int32_t)(pDest - dest);
    126     if(pDestLength) {
    127         *pDestLength = reqLength;
    128     }
    129     if(pNumSubstitutions != NULL) {
    130         *pNumSubstitutions = numSubstitutions;
    131     }
    132 
    133     /* Terminate the buffer */
    134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    135 
    136     return dest;
    137 }
    138 
    139 U_CAPI UChar* U_EXPORT2
    140 u_strFromUTF32(UChar *dest,
    141                int32_t destCapacity,
    142                int32_t *pDestLength,
    143                const UChar32 *src,
    144                int32_t srcLength,
    145                UErrorCode *pErrorCode) {
    146     return u_strFromUTF32WithSub(
    147             dest, destCapacity, pDestLength,
    148             src, srcLength,
    149             U_SENTINEL, NULL,
    150             pErrorCode);
    151 }
    152 
    153 U_CAPI UChar32* U_EXPORT2
    154 u_strToUTF32WithSub(UChar32 *dest,
    155              int32_t destCapacity,
    156              int32_t *pDestLength,
    157              const UChar *src,
    158              int32_t srcLength,
    159              UChar32 subchar, int32_t *pNumSubstitutions,
    160              UErrorCode *pErrorCode) {
    161     const UChar *srcLimit;
    162     UChar32 ch;
    163     UChar ch2;
    164     UChar32 *destLimit;
    165     UChar32 *pDest;
    166     int32_t reqLength;
    167     int32_t numSubstitutions;
    168 
    169     /* args check */
    170     if(U_FAILURE(*pErrorCode)){
    171         return NULL;
    172     }
    173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    176     ) {
    177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    178         return NULL;
    179     }
    180 
    181     if(pNumSubstitutions != NULL) {
    182         *pNumSubstitutions = 0;
    183     }
    184 
    185     pDest = dest;
    186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
    187     reqLength = 0;
    188     numSubstitutions = 0;
    189 
    190     if(srcLength < 0) {
    191         /* simple loop for conversion of a NUL-terminated BMP string */
    192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    193             ++src;
    194             if(pDest < destLimit) {
    195                 *pDest++ = ch;
    196             } else {
    197                 ++reqLength;
    198             }
    199         }
    200         srcLimit = src;
    201         if(ch != 0) {
    202             /* "complicated" case, find the end of the remaining string */
    203             while(*++srcLimit != 0) {}
    204         }
    205     } else {
    206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
    207     }
    208 
    209     /* convert with length */
    210     while(src < srcLimit) {
    211         ch = *src++;
    212         if(!U16_IS_SURROGATE(ch)) {
    213             /* write or count ch below */
    214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    215             ++src;
    216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    217         } else if((ch = subchar) < 0) {
    218             /* unpaired surrogate */
    219             *pErrorCode = U_INVALID_CHAR_FOUND;
    220             return NULL;
    221         } else {
    222             ++numSubstitutions;
    223         }
    224         if(pDest < destLimit) {
    225             *pDest++ = ch;
    226         } else {
    227             ++reqLength;
    228         }
    229     }
    230 
    231     reqLength += (int32_t)(pDest - dest);
    232     if(pDestLength) {
    233         *pDestLength = reqLength;
    234     }
    235     if(pNumSubstitutions != NULL) {
    236         *pNumSubstitutions = numSubstitutions;
    237     }
    238 
    239     /* Terminate the buffer */
    240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    241 
    242     return dest;
    243 }
    244 
    245 U_CAPI UChar32* U_EXPORT2
    246 u_strToUTF32(UChar32 *dest,
    247              int32_t destCapacity,
    248              int32_t *pDestLength,
    249              const UChar *src,
    250              int32_t srcLength,
    251              UErrorCode *pErrorCode) {
    252     return u_strToUTF32WithSub(
    253             dest, destCapacity, pDestLength,
    254             src, srcLength,
    255             U_SENTINEL, NULL,
    256             pErrorCode);
    257 }
    258 
    259 /* for utf8_nextCharSafeBodyTerminated() */
    260 static const UChar32
    261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
    262 
    263 /*
    264  * Version of utf8_nextCharSafeBody() with the following differences:
    265  * - checks for NUL termination instead of length
    266  * - works with pointers instead of indexes
    267  * - always strict (strict==-1)
    268  *
    269  * *ps points to after the lead byte and will be moved to after the last trail byte.
    270  * c is the lead byte.
    271  * @return the code point, or U_SENTINEL
    272  */
    273 static UChar32
    274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
    275     const uint8_t *s=*ps;
    276     uint8_t trail, illegal=0;
    277     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    278     U_ASSERT(count<6);
    279     U8_MASK_LEAD_BYTE((c), count);
    280     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    281     switch(count) {
    282     /* each branch falls through to the next one */
    283     case 5:
    284     case 4:
    285         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    286         illegal=1;
    287         break;
    288     case 3:
    289         trail=(uint8_t)(*s++ - 0x80);
    290         c=(c<<6)|trail;
    291         if(trail>0x3f || c>=0x110) {
    292             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
    293             illegal=1;
    294             break;
    295         }
    296         U_FALLTHROUGH;
    297     case 2:
    298         trail=(uint8_t)(*s++ - 0x80);
    299         if(trail>0x3f) {
    300             /* not a trail byte */
    301             illegal=1;
    302             break;
    303         }
    304         c=(c<<6)|trail;
    305         U_FALLTHROUGH;
    306     case 1:
    307         trail=(uint8_t)(*s++ - 0x80);
    308         if(trail>0x3f) {
    309             /* not a trail byte */
    310             illegal=1;
    311         }
    312         c=(c<<6)|trail;
    313         break;
    314     case 0:
    315         return U_SENTINEL;
    316     /* no default branch to optimize switch()  - all values are covered */
    317     }
    318 
    319     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    320     /* illegal is also set if count>=4 */
    321     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    322         /* error handling */
    323         /* don't go beyond this sequence */
    324         s=*ps;
    325         while(count>0 && U8_IS_TRAIL(*s)) {
    326             ++s;
    327             --count;
    328         }
    329         c=U_SENTINEL;
    330     }
    331     *ps=s;
    332     return c;
    333 }
    334 
    335 /*
    336  * Version of utf8_nextCharSafeBody() with the following differences:
    337  * - works with pointers instead of indexes
    338  * - always strict (strict==-1)
    339  *
    340  * *ps points to after the lead byte and will be moved to after the last trail byte.
    341  * c is the lead byte.
    342  * @return the code point, or U_SENTINEL
    343  */
    344 static UChar32
    345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
    346     const uint8_t *s=*ps;
    347     uint8_t trail, illegal=0;
    348     uint8_t count=U8_COUNT_TRAIL_BYTES(c);
    349     if((limit-s)>=count) {
    350         U8_MASK_LEAD_BYTE((c), count);
    351         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    352         switch(count) {
    353         /* each branch falls through to the next one */
    354         case 5:
    355         case 4:
    356             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    357             illegal=1;
    358             break;
    359         case 3:
    360             trail=*s++;
    361             c=(c<<6)|(trail&0x3f);
    362             if(c<0x110) {
    363                 illegal|=(trail&0xc0)^0x80;
    364             } else {
    365                 /* code point>0x10ffff, outside Unicode */
    366                 illegal=1;
    367                 break;
    368             }
    369             U_FALLTHROUGH;
    370         case 2:
    371             trail=*s++;
    372             c=(c<<6)|(trail&0x3f);
    373             illegal|=(trail&0xc0)^0x80;
    374             U_FALLTHROUGH;
    375         case 1:
    376             trail=*s++;
    377             c=(c<<6)|(trail&0x3f);
    378             illegal|=(trail&0xc0)^0x80;
    379             break;
    380         case 0:
    381             return U_SENTINEL;
    382         /* no default branch to optimize switch()  - all values are covered */
    383         }
    384     } else {
    385         illegal=1; /* too few bytes left */
    386     }
    387 
    388     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    389     /* illegal is also set if count>=4 */
    390     U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal));
    391     if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) {
    392         /* error handling */
    393         /* don't go beyond this sequence */
    394         s=*ps;
    395         while(count>0 && s<limit && U8_IS_TRAIL(*s)) {
    396             ++s;
    397             --count;
    398         }
    399         c=U_SENTINEL;
    400     }
    401     *ps=s;
    402     return c;
    403 }
    404 
    405 U_CAPI UChar* U_EXPORT2
    406 u_strFromUTF8WithSub(UChar *dest,
    407               int32_t destCapacity,
    408               int32_t *pDestLength,
    409               const char* src,
    410               int32_t srcLength,
    411               UChar32 subchar, int32_t *pNumSubstitutions,
    412               UErrorCode *pErrorCode){
    413     UChar *pDest = dest;
    414     UChar *pDestLimit = dest+destCapacity;
    415     UChar32 ch;
    416     int32_t reqLength = 0;
    417     const uint8_t* pSrc = (const uint8_t*) src;
    418     uint8_t t1, t2; /* trail bytes */
    419     int32_t numSubstitutions;
    420 
    421     /* args check */
    422     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    423         return NULL;
    424     }
    425 
    426     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    427         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    428         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    429     ) {
    430         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    431         return NULL;
    432     }
    433 
    434     if(pNumSubstitutions!=NULL) {
    435         *pNumSubstitutions=0;
    436     }
    437     numSubstitutions=0;
    438 
    439     /*
    440      * Inline processing of UTF-8 byte sequences:
    441      *
    442      * Byte sequences for the most common characters are handled inline in
    443      * the conversion loops. In order to reduce the path lengths for those
    444      * characters, the tests are arranged in a kind of binary search.
    445      * ASCII (<=0x7f) is checked first, followed by the dividing point
    446      * between 2- and 3-byte sequences (0xe0).
    447      * The 3-byte branch is tested first to speed up CJK text.
    448      * The compiler should combine the subtractions for the two tests for 0xe0.
    449      * Each branch then tests for the other end of its range.
    450      */
    451 
    452     if(srcLength < 0){
    453         /*
    454          * Transform a NUL-terminated string.
    455          * The code explicitly checks for NULs only in the lead byte position.
    456          * A NUL byte in the trail byte position fails the trail byte range check anyway.
    457          */
    458         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    459             if(ch <= 0x7f){
    460                 *pDest++=(UChar)ch;
    461                 ++pSrc;
    462             } else {
    463                 if(ch > 0xe0) {
    464                     if( /* handle U+1000..U+CFFF inline */
    465                         ch <= 0xec &&
    466                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    467                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    468                     ) {
    469                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    470                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    471                         pSrc += 3;
    472                         continue;
    473                     }
    474                 } else if(ch < 0xe0) {
    475                     if( /* handle U+0080..U+07FF inline */
    476                         ch >= 0xc2 &&
    477                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    478                     ) {
    479                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    480                         pSrc += 2;
    481                         continue;
    482                     }
    483                 }
    484 
    485                 /* function call for "complicated" and error cases */
    486                 ++pSrc; /* continue after the lead byte */
    487                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    488                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    489                     *pErrorCode = U_INVALID_CHAR_FOUND;
    490                     return NULL;
    491                 } else if(ch<=0xFFFF) {
    492                     *(pDest++)=(UChar)ch;
    493                 } else {
    494                     *(pDest++)=U16_LEAD(ch);
    495                     if(pDest<pDestLimit) {
    496                         *(pDest++)=U16_TRAIL(ch);
    497                     } else {
    498                         reqLength++;
    499                         break;
    500                     }
    501                 }
    502             }
    503         }
    504 
    505         /* Pre-flight the rest of the string. */
    506         while((ch = *pSrc) != 0) {
    507             if(ch <= 0x7f){
    508                 ++reqLength;
    509                 ++pSrc;
    510             } else {
    511                 if(ch > 0xe0) {
    512                     if( /* handle U+1000..U+CFFF inline */
    513                         ch <= 0xec &&
    514                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    515                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    516                     ) {
    517                         ++reqLength;
    518                         pSrc += 3;
    519                         continue;
    520                     }
    521                 } else if(ch < 0xe0) {
    522                     if( /* handle U+0080..U+07FF inline */
    523                         ch >= 0xc2 &&
    524                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    525                     ) {
    526                         ++reqLength;
    527                         pSrc += 2;
    528                         continue;
    529                     }
    530                 }
    531 
    532                 /* function call for "complicated" and error cases */
    533                 ++pSrc; /* continue after the lead byte */
    534                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    535                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    536                     *pErrorCode = U_INVALID_CHAR_FOUND;
    537                     return NULL;
    538                 }
    539                 reqLength += U16_LENGTH(ch);
    540             }
    541         }
    542     } else /* srcLength >= 0 */ {
    543         const uint8_t *pSrcLimit = pSrc + srcLength;
    544         int32_t count;
    545 
    546         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    547         for(;;) {
    548             /*
    549              * Each iteration of the inner loop progresses by at most 3 UTF-8
    550              * bytes and one UChar, for most characters.
    551              * For supplementary code points (4 & 2), which are rare,
    552              * there is an additional adjustment.
    553              */
    554             count = (int32_t)(pDestLimit - pDest);
    555             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
    556             if(count > srcLength) {
    557                 count = srcLength; /* min(remaining dest, remaining src/3) */
    558             }
    559             if(count < 3) {
    560                 /*
    561                  * Too much overhead if we get near the end of the string,
    562                  * continue with the next loop.
    563                  */
    564                 break;
    565             }
    566 
    567             do {
    568                 ch = *pSrc;
    569                 if(ch <= 0x7f){
    570                     *pDest++=(UChar)ch;
    571                     ++pSrc;
    572                 } else {
    573                     if(ch > 0xe0) {
    574                         if( /* handle U+1000..U+CFFF inline */
    575                             ch <= 0xec &&
    576                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    577                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    578                         ) {
    579                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    580                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    581                             pSrc += 3;
    582                             continue;
    583                         }
    584                     } else if(ch < 0xe0) {
    585                         if( /* handle U+0080..U+07FF inline */
    586                             ch >= 0xc2 &&
    587                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    588                         ) {
    589                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    590                             pSrc += 2;
    591                             continue;
    592                         }
    593                     }
    594 
    595                     if(ch >= 0xf0 || subchar > 0xffff) {
    596                         /*
    597                          * We may read up to six bytes and write up to two UChars,
    598                          * which we didn't account for with computing count,
    599                          * so we adjust it here.
    600                          */
    601                         if(--count == 0) {
    602                             break;
    603                         }
    604                     }
    605 
    606                     /* function call for "complicated" and error cases */
    607                     ++pSrc; /* continue after the lead byte */
    608                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    609                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    610                         *pErrorCode = U_INVALID_CHAR_FOUND;
    611                         return NULL;
    612                     }else if(ch<=0xFFFF){
    613                         *(pDest++)=(UChar)ch;
    614                     }else{
    615                         *(pDest++)=U16_LEAD(ch);
    616                         *(pDest++)=U16_TRAIL(ch);
    617                     }
    618                 }
    619             } while(--count > 0);
    620         }
    621 
    622         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
    623             ch = *pSrc;
    624             if(ch <= 0x7f){
    625                 *pDest++=(UChar)ch;
    626                 ++pSrc;
    627             } else {
    628                 if(ch > 0xe0) {
    629                     if( /* handle U+1000..U+CFFF inline */
    630                         ch <= 0xec &&
    631                         ((pSrcLimit - pSrc) >= 3) &&
    632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    633                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    634                     ) {
    635                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    636                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    637                         pSrc += 3;
    638                         continue;
    639                     }
    640                 } else if(ch < 0xe0) {
    641                     if( /* handle U+0080..U+07FF inline */
    642                         ch >= 0xc2 &&
    643                         ((pSrcLimit - pSrc) >= 2) &&
    644                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    645                     ) {
    646                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    647                         pSrc += 2;
    648                         continue;
    649                     }
    650                 }
    651 
    652                 /* function call for "complicated" and error cases */
    653                 ++pSrc; /* continue after the lead byte */
    654                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    655                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    656                     *pErrorCode = U_INVALID_CHAR_FOUND;
    657                     return NULL;
    658                 }else if(ch<=0xFFFF){
    659                     *(pDest++)=(UChar)ch;
    660                 }else{
    661                     *(pDest++)=U16_LEAD(ch);
    662                     if(pDest<pDestLimit){
    663                         *(pDest++)=U16_TRAIL(ch);
    664                     }else{
    665                         reqLength++;
    666                         break;
    667                     }
    668                 }
    669             }
    670         }
    671         /* do not fill the dest buffer just count the UChars needed */
    672         while(pSrc < pSrcLimit){
    673             ch = *pSrc;
    674             if(ch <= 0x7f){
    675                 reqLength++;
    676                 ++pSrc;
    677             } else {
    678                 if(ch > 0xe0) {
    679                     if( /* handle U+1000..U+CFFF inline */
    680                         ch <= 0xec &&
    681                         ((pSrcLimit - pSrc) >= 3) &&
    682                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    683                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    684                     ) {
    685                         reqLength++;
    686                         pSrc += 3;
    687                         continue;
    688                     }
    689                 } else if(ch < 0xe0) {
    690                     if( /* handle U+0080..U+07FF inline */
    691                         ch >= 0xc2 &&
    692                         ((pSrcLimit - pSrc) >= 2) &&
    693                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    694                     ) {
    695                         reqLength++;
    696                         pSrc += 2;
    697                         continue;
    698                     }
    699                 }
    700 
    701                 /* function call for "complicated" and error cases */
    702                 ++pSrc; /* continue after the lead byte */
    703                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    704                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    705                     *pErrorCode = U_INVALID_CHAR_FOUND;
    706                     return NULL;
    707                 }
    708                 reqLength+=U16_LENGTH(ch);
    709             }
    710         }
    711     }
    712 
    713     reqLength+=(int32_t)(pDest - dest);
    714 
    715     if(pNumSubstitutions!=NULL) {
    716         *pNumSubstitutions=numSubstitutions;
    717     }
    718 
    719     if(pDestLength){
    720         *pDestLength = reqLength;
    721     }
    722 
    723     /* Terminate the buffer */
    724     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    725 
    726     return dest;
    727 }
    728 
    729 U_CAPI UChar* U_EXPORT2
    730 u_strFromUTF8(UChar *dest,
    731               int32_t destCapacity,
    732               int32_t *pDestLength,
    733               const char* src,
    734               int32_t srcLength,
    735               UErrorCode *pErrorCode){
    736     return u_strFromUTF8WithSub(
    737             dest, destCapacity, pDestLength,
    738             src, srcLength,
    739             U_SENTINEL, NULL,
    740             pErrorCode);
    741 }
    742 
    743 U_CAPI UChar * U_EXPORT2
    744 u_strFromUTF8Lenient(UChar *dest,
    745                      int32_t destCapacity,
    746                      int32_t *pDestLength,
    747                      const char *src,
    748                      int32_t srcLength,
    749                      UErrorCode *pErrorCode) {
    750     UChar *pDest = dest;
    751     UChar32 ch;
    752     int32_t reqLength = 0;
    753     uint8_t* pSrc = (uint8_t*) src;
    754 
    755     /* args check */
    756     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    757         return NULL;
    758     }
    759 
    760     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    761         (destCapacity<0) || (dest == NULL && destCapacity > 0)
    762     ) {
    763         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    764         return NULL;
    765     }
    766 
    767     if(srcLength < 0) {
    768         /* Transform a NUL-terminated string. */
    769         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
    770         uint8_t t1, t2, t3; /* trail bytes */
    771 
    772         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    773             if(ch < 0xc0) {
    774                 /*
    775                  * ASCII, or a trail byte in lead position which is treated like
    776                  * a single-byte sequence for better character boundary
    777                  * resynchronization after illegal sequences.
    778                  */
    779                 *pDest++=(UChar)ch;
    780                 ++pSrc;
    781                 continue;
    782             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    783                 if((t1 = pSrc[1]) != 0) {
    784                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    785                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
    786                     pSrc += 2;
    787                     continue;
    788                 }
    789             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    790                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    791                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    792                     /* 0x2080 = (0x80 << 6) + 0x80 */
    793                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    794                     pSrc += 3;
    795                     continue;
    796                 }
    797             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    798                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    799                     pSrc += 4;
    800                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    801                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    802                     *(pDest++) = U16_LEAD(ch);
    803                     if(pDest < pDestLimit) {
    804                         *(pDest++) = U16_TRAIL(ch);
    805                     } else {
    806                         reqLength = 1;
    807                         break;
    808                     }
    809                     continue;
    810                 }
    811             }
    812 
    813             /* truncated character at the end */
    814             *pDest++ = 0xfffd;
    815             while(*++pSrc != 0) {}
    816             break;
    817         }
    818 
    819         /* Pre-flight the rest of the string. */
    820         while((ch = *pSrc) != 0) {
    821             if(ch < 0xc0) {
    822                 /*
    823                  * ASCII, or a trail byte in lead position which is treated like
    824                  * a single-byte sequence for better character boundary
    825                  * resynchronization after illegal sequences.
    826                  */
    827                 ++reqLength;
    828                 ++pSrc;
    829                 continue;
    830             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    831                 if(pSrc[1] != 0) {
    832                     ++reqLength;
    833                     pSrc += 2;
    834                     continue;
    835                 }
    836             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    837                 if(pSrc[1] != 0 && pSrc[2] != 0) {
    838                     ++reqLength;
    839                     pSrc += 3;
    840                     continue;
    841                 }
    842             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    843                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    844                     reqLength += 2;
    845                     pSrc += 4;
    846                     continue;
    847                 }
    848             }
    849 
    850             /* truncated character at the end */
    851             ++reqLength;
    852             break;
    853         }
    854     } else /* srcLength >= 0 */ {
    855       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
    856 
    857         /*
    858          * This function requires that if srcLength is given, then it must be
    859          * destCapatity >= srcLength so that we need not check for
    860          * destination buffer overflow in the loop.
    861          */
    862         if(destCapacity < srcLength) {
    863             if(pDestLength != NULL) {
    864                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    865             }
    866             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    867             return NULL;
    868         }
    869 
    870         if((pSrcLimit - pSrc) >= 4) {
    871             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    872 
    873             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    874             do {
    875                 ch = *pSrc++;
    876                 if(ch < 0xc0) {
    877                     /*
    878                      * ASCII, or a trail byte in lead position which is treated like
    879                      * a single-byte sequence for better character boundary
    880                      * resynchronization after illegal sequences.
    881                      */
    882                     *pDest++=(UChar)ch;
    883                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
    884                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    885                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    886                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    887                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    888                     /* 0x2080 = (0x80 << 6) + 0x80 */
    889                     ch = (ch << 12) + (*pSrc++ << 6);
    890                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    891                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    892                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    893                     ch = (ch << 18) + (*pSrc++ << 12);
    894                     ch += *pSrc++ << 6;
    895                     ch += *pSrc++ - 0x3c82080;
    896                     *(pDest++) = U16_LEAD(ch);
    897                     *(pDest++) = U16_TRAIL(ch);
    898                 }
    899             } while(pSrc < pSrcLimit);
    900 
    901             pSrcLimit += 3; /* restore original pSrcLimit */
    902         }
    903 
    904         while(pSrc < pSrcLimit) {
    905             ch = *pSrc++;
    906             if(ch < 0xc0) {
    907                 /*
    908                  * ASCII, or a trail byte in lead position which is treated like
    909                  * a single-byte sequence for better character boundary
    910                  * resynchronization after illegal sequences.
    911                  */
    912                 *pDest++=(UChar)ch;
    913                 continue;
    914             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    915                 if(pSrc < pSrcLimit) {
    916                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    917                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    918                     continue;
    919                 }
    920             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    921                 if((pSrcLimit - pSrc) >= 2) {
    922                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    923                     /* 0x2080 = (0x80 << 6) + 0x80 */
    924                     ch = (ch << 12) + (*pSrc++ << 6);
    925                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    926                     pSrc += 3;
    927                     continue;
    928                 }
    929             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    930                 if((pSrcLimit - pSrc) >= 3) {
    931                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    932                     ch = (ch << 18) + (*pSrc++ << 12);
    933                     ch += *pSrc++ << 6;
    934                     ch += *pSrc++ - 0x3c82080;
    935                     *(pDest++) = U16_LEAD(ch);
    936                     *(pDest++) = U16_TRAIL(ch);
    937                     pSrc += 4;
    938                     continue;
    939                 }
    940             }
    941 
    942             /* truncated character at the end */
    943             *pDest++ = 0xfffd;
    944             break;
    945         }
    946     }
    947 
    948     reqLength+=(int32_t)(pDest - dest);
    949 
    950     if(pDestLength){
    951         *pDestLength = reqLength;
    952     }
    953 
    954     /* Terminate the buffer */
    955     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    956 
    957     return dest;
    958 }
    959 
    960 static inline uint8_t *
    961 _appendUTF8(uint8_t *pDest, UChar32 c) {
    962     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    963     if((c)<=0x7f) {
    964         *pDest++=(uint8_t)c;
    965     } else if(c<=0x7ff) {
    966         *pDest++=(uint8_t)((c>>6)|0xc0);
    967         *pDest++=(uint8_t)((c&0x3f)|0x80);
    968     } else if(c<=0xffff) {
    969         *pDest++=(uint8_t)((c>>12)|0xe0);
    970         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
    971         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    972     } else /* if((uint32_t)(c)<=0x10ffff) */ {
    973         *pDest++=(uint8_t)(((c)>>18)|0xf0);
    974         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
    975         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
    976         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    977     }
    978     return pDest;
    979 }
    980 
    981 
/**
 * Convert a UTF-16 string to UTF-8, optionally replacing unpaired surrogates
 * with a substitution character.
 *
 * As much output as fits is written to dest; the remaining input is then only
 * measured ("preflighted") so that the full required length can be reported.
 *
 * @param dest              Output buffer; may be NULL only if destCapacity is 0.
 * @param destCapacity      Capacity of dest in bytes; must not be negative.
 * @param pDestLength       If not NULL, receives the total number of bytes the
 *                          converted string needs.
 * @param pSrc              UTF-16 input; may be NULL only if srcLength is 0.
 * @param srcLength         Number of UChars in pSrc, or -1 if NUL-terminated.
 * @param subchar           Code point written in place of each unpaired
 *                          surrogate. If negative, an unpaired surrogate sets
 *                          U_INVALID_CHAR_FOUND and the function returns NULL.
 *                          Values above 0x10ffff or surrogate code points are
 *                          rejected with U_ILLEGAL_ARGUMENT_ERROR.
 * @param pNumSubstitutions If not NULL, set to 0 on entry and to the number of
 *                          substitutions on normal completion.
 * @param pErrorCode        In/out error code. Nothing is done if it is NULL or
 *                          already indicates a failure.
 * @return dest, or NULL on the error conditions described above.
 */
U_CAPI char* U_EXPORT2
u_strToUTF8WithSub(char *dest,
            int32_t destCapacity,
            int32_t *pDestLength,
            const UChar *pSrc,
            int32_t srcLength,
            UChar32 subchar, int32_t *pNumSubstitutions,
            UErrorCode *pErrorCode){
    int32_t reqLength=0;
    uint32_t ch=0,ch2=0;
    uint8_t *pDest = (uint8_t *)dest;
    uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    int32_t numSubstitutions;

    /* args check */
    if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
        return NULL;
    }

    if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
        (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
        subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    ) {
        *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }

    /* Report zero substitutions in case of an early error return below. */
    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=0;
    }
    numSubstitutions=0;

    if(srcLength==-1) {
        /*
         * NUL-terminated input: convert while the output fits.
         * When a character does not fit, its source unit(s) have already been
         * consumed, so reqLength is seeded with that character's byte length
         * and the preflight loop below continues with the following character.
         */
        while((ch=*pSrc)!=0) {
            ++pSrc;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                /* BMP code unit that is not a surrogate: three bytes */
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
                if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                /* ch is now a supplementary code point or subchar; its length varies */
                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Preflight: only count the bytes needed for the rest of the string. */
        while((ch=*pSrc++)!=0) {
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    } else {
        /* Input with an explicit length. */
        const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
        int32_t count;

        /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
        for(;;) {
            /*
             * Each iteration of the inner loop progresses by at most 3 UTF-8
             * bytes and one UChar, for most characters.
             * For supplementary code points (4 & 2), which are rare,
             * there is an additional adjustment.
             */
            count = (int32_t)((pDestLimit - pDest) / 3);
            srcLength = (int32_t)(pSrcLimit - pSrc);
            if(count > srcLength) {
                count = srcLength; /* min(remaining dest/3, remaining src) */
            }
            if(count < 3) {
                /*
                 * Too much overhead if we get near the end of the string,
                 * continue with the next loop.
                 */
                break;
            }
            /* count iterations are safe for both buffers without further checks */
            do {
                ch=*pSrc++;
                if(ch <= 0x7f) {
                    *pDest++ = (uint8_t)ch;
                } else if(ch <= 0x7ff) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else if(ch <= 0xd7ff || ch >= 0xe000) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else /* ch is a surrogate */ {
                    /*
                     * We will read two UChars and probably output four bytes,
                     * which we didn't account for with computing count,
                     * so we adjust it here.
                     */
                    if(--count == 0) {
                        --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
                        break;  /* recompute count */
                    }

                    if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
                        ++pSrc;
                        ch=U16_GET_SUPPLEMENTARY(ch, ch2);

                        /* writing 4 bytes per 2 UChars is ok */
                        *pDest++=(uint8_t)((ch>>18)|0xf0);
                        *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
                        *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                        *pDest++=(uint8_t)((ch&0x3f)|0x80);
                    } else  {
                        /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                        if(subchar>=0) {
                            ch=subchar;
                            ++numSubstitutions;
                        } else {
                            *pErrorCode = U_INVALID_CHAR_FOUND;
                            return NULL;
                        }

                        /* convert and append*/
                        pDest=_appendUTF8(pDest, ch);
                    }
                }
            } while(--count > 0);
        }

        /*
         * Careful loop for the end of the string or a nearly full buffer:
         * every write is checked against pDestLimit. As in the NUL-terminated
         * case, a character that does not fit seeds reqLength with its length.
         */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch <= 0x7f) {
                if(pDest<pDestLimit) {
                    *pDest++ = (uint8_t)ch;
                } else {
                    reqLength = 1;
                    break;
                }
            } else if(ch <= 0x7ff) {
                if((pDestLimit - pDest) >= 2) {
                    *pDest++=(uint8_t)((ch>>6)|0xc0);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 2;
                    break;
                }
            } else if(ch <= 0xd7ff || ch >= 0xe000) {
                if((pDestLimit - pDest) >= 3) {
                    *pDest++=(uint8_t)((ch>>12)|0xe0);
                    *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
                    *pDest++=(uint8_t)((ch&0x3f)|0x80);
                } else {
                    reqLength = 3;
                    break;
                }
            } else /* ch is a surrogate */ {
                int32_t length;

                /* the trail surrogate may only be read if it lies before pSrcLimit */
                if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                    ++pSrc;
                    ch=U16_GET_SUPPLEMENTARY(ch, ch2);
                } else if(subchar>=0) {
                    ch=subchar;
                    ++numSubstitutions;
                } else {
                    /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                    *pErrorCode = U_INVALID_CHAR_FOUND;
                    return NULL;
                }

                length = U8_LENGTH(ch);
                if((pDestLimit - pDest) >= length) {
                    /* convert and append*/
                    pDest=_appendUTF8(pDest, ch);
                } else {
                    reqLength = length;
                    break;
                }
            }
        }
        /* Preflight: only count the bytes needed for the rest of the string. */
        while(pSrc<pSrcLimit) {
            ch=*pSrc++;
            if(ch<=0x7f) {
                ++reqLength;
            } else if(ch<=0x7ff) {
                reqLength+=2;
            } else if(!U16_IS_SURROGATE(ch)) {
                reqLength+=3;
            } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
                ++pSrc;
                reqLength+=4;
            } else if(subchar>=0) {
                reqLength+=U8_LENGTH(subchar);
                ++numSubstitutions;
            } else {
                /* Unicode 3.2 forbids surrogate code points in UTF-8 */
                *pErrorCode = U_INVALID_CHAR_FOUND;
                return NULL;
            }
        }
    }

    /* total = bytes actually written + bytes counted for what did not fit */
    reqLength+=(int32_t)(pDest - (uint8_t *)dest);

    if(pNumSubstitutions!=NULL) {
        *pNumSubstitutions=numSubstitutions;
    }

    if(pDestLength){
        *pDestLength = reqLength;
    }

    /* Terminate the buffer */
    /* NOTE(review): u_terminateChars is defined elsewhere; presumably it also reports overflow/unterminated status through pErrorCode -- confirm. */
    u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
    return dest;
}
   1245 
   1246 U_CAPI char* U_EXPORT2
   1247 u_strToUTF8(char *dest,
   1248             int32_t destCapacity,
   1249             int32_t *pDestLength,
   1250             const UChar *pSrc,
   1251             int32_t srcLength,
   1252             UErrorCode *pErrorCode){
   1253     return u_strToUTF8WithSub(
   1254             dest, destCapacity, pDestLength,
   1255             pSrc, srcLength,
   1256             U_SENTINEL, NULL,
   1257             pErrorCode);
   1258 }
   1259 
   1260 U_CAPI UChar* U_EXPORT2
   1261 u_strFromJavaModifiedUTF8WithSub(
   1262         UChar *dest,
   1263         int32_t destCapacity,
   1264         int32_t *pDestLength,
   1265         const char *src,
   1266         int32_t srcLength,
   1267         UChar32 subchar, int32_t *pNumSubstitutions,
   1268         UErrorCode *pErrorCode) {
   1269     UChar *pDest = dest;
   1270     UChar *pDestLimit = dest+destCapacity;
   1271     UChar32 ch;
   1272     int32_t reqLength = 0;
   1273     const uint8_t* pSrc = (const uint8_t*) src;
   1274     const uint8_t *pSrcLimit;
   1275     int32_t count;
   1276     uint8_t t1, t2; /* trail bytes */
   1277     int32_t numSubstitutions;
   1278 
   1279     /* args check */
   1280     if(U_FAILURE(*pErrorCode)){
   1281         return NULL;
   1282     }
   1283     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1284         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
   1285         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1286     ) {
   1287         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1288         return NULL;
   1289     }
   1290 
   1291     if(pNumSubstitutions!=NULL) {
   1292         *pNumSubstitutions=0;
   1293     }
   1294     numSubstitutions=0;
   1295 
   1296     if(srcLength < 0) {
   1297         /*
   1298          * Transform a NUL-terminated ASCII string.
   1299          * Handle non-ASCII strings with slower code.
   1300          */
   1301         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
   1302             *pDest++=(UChar)ch;
   1303             ++pSrc;
   1304         }
   1305         if(ch == 0) {
   1306             reqLength=(int32_t)(pDest - dest);
   1307             if(pDestLength) {
   1308                 *pDestLength = reqLength;
   1309             }
   1310 
   1311             /* Terminate the buffer */
   1312             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1313             return dest;
   1314         }
   1315         srcLength = uprv_strlen((const char *)pSrc);
   1316     }
   1317 
   1318     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1319     pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength;
   1320     for(;;) {
   1321         count = (int32_t)(pDestLimit - pDest);
   1322         srcLength = (int32_t)(pSrcLimit - pSrc);
   1323         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
   1324             /* fast ASCII loop */
   1325             const uint8_t *prevSrc = pSrc;
   1326             int32_t delta;
   1327             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
   1328                 *pDest++=(UChar)ch;
   1329                 ++pSrc;
   1330             }
   1331             delta = (int32_t)(pSrc - prevSrc);
   1332             count -= delta;
   1333             srcLength -= delta;
   1334         }
   1335         /*
   1336          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1337          * bytes and one UChar.
   1338          */
   1339         srcLength /= 3;
   1340         if(count > srcLength) {
   1341             count = srcLength; /* min(remaining dest, remaining src/3) */
   1342         }
   1343         if(count < 3) {
   1344             /*
   1345              * Too much overhead if we get near the end of the string,
   1346              * continue with the next loop.
   1347              */
   1348             break;
   1349         }
   1350         do {
   1351             ch = *pSrc;
   1352             if(ch <= 0x7f){
   1353                 *pDest++=(UChar)ch;
   1354                 ++pSrc;
   1355             } else {
   1356                 if(ch >= 0xe0) {
   1357                     if( /* handle U+0000..U+FFFF inline */
   1358                         ch <= 0xef &&
   1359                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1360                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1361                     ) {
   1362                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1363                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1364                         pSrc += 3;
   1365                         continue;
   1366                     }
   1367                 } else {
   1368                     if( /* handle U+0000..U+07FF inline */
   1369                         ch >= 0xc0 &&
   1370                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1371                     ) {
   1372                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1373                         pSrc += 2;
   1374                         continue;
   1375                     }
   1376                 }
   1377 
   1378                 if(subchar < 0) {
   1379                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1380                     return NULL;
   1381                 } else if(subchar > 0xffff && --count == 0) {
   1382                     /*
   1383                      * We need to write two UChars, adjusted count for that,
   1384                      * and ran out of space.
   1385                      */
   1386                     break;
   1387                 } else {
   1388                     /* function call for error cases */
   1389                     ++pSrc; /* continue after the lead byte */
   1390                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1391                     ++numSubstitutions;
   1392                     if(subchar<=0xFFFF) {
   1393                         *(pDest++)=(UChar)subchar;
   1394                     } else {
   1395                         *(pDest++)=U16_LEAD(subchar);
   1396                         *(pDest++)=U16_TRAIL(subchar);
   1397                     }
   1398                 }
   1399             }
   1400         } while(--count > 0);
   1401     }
   1402 
   1403     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
   1404         ch = *pSrc;
   1405         if(ch <= 0x7f){
   1406             *pDest++=(UChar)ch;
   1407             ++pSrc;
   1408         } else {
   1409             if(ch >= 0xe0) {
   1410                 if( /* handle U+0000..U+FFFF inline */
   1411                     ch <= 0xef &&
   1412                     ((pSrcLimit - pSrc) >= 3) &&
   1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1414                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1415                 ) {
   1416                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1417                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1418                     pSrc += 3;
   1419                     continue;
   1420                 }
   1421             } else {
   1422                 if( /* handle U+0000..U+07FF inline */
   1423                     ch >= 0xc0 &&
   1424                     ((pSrcLimit - pSrc) >= 2) &&
   1425                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1426                 ) {
   1427                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1428                     pSrc += 2;
   1429                     continue;
   1430                 }
   1431             }
   1432 
   1433             if(subchar < 0) {
   1434                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1435                 return NULL;
   1436             } else {
   1437                 /* function call for error cases */
   1438                 ++pSrc; /* continue after the lead byte */
   1439                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1440                 ++numSubstitutions;
   1441                 if(subchar<=0xFFFF) {
   1442                     *(pDest++)=(UChar)subchar;
   1443                 } else {
   1444                     *(pDest++)=U16_LEAD(subchar);
   1445                     if(pDest<pDestLimit) {
   1446                         *(pDest++)=U16_TRAIL(subchar);
   1447                     } else {
   1448                         reqLength++;
   1449                         break;
   1450                     }
   1451                 }
   1452             }
   1453         }
   1454     }
   1455 
   1456     /* do not fill the dest buffer just count the UChars needed */
   1457     while(pSrc < pSrcLimit){
   1458         ch = *pSrc;
   1459         if(ch <= 0x7f) {
   1460             reqLength++;
   1461             ++pSrc;
   1462         } else {
   1463             if(ch >= 0xe0) {
   1464                 if( /* handle U+0000..U+FFFF inline */
   1465                     ch <= 0xef &&
   1466                     ((pSrcLimit - pSrc) >= 3) &&
   1467                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   1468                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   1469                 ) {
   1470                     reqLength++;
   1471                     pSrc += 3;
   1472                     continue;
   1473                 }
   1474             } else {
   1475                 if( /* handle U+0000..U+07FF inline */
   1476                     ch >= 0xc0 &&
   1477                     ((pSrcLimit - pSrc) >= 2) &&
   1478                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   1479                 ) {
   1480                     reqLength++;
   1481                     pSrc += 2;
   1482                     continue;
   1483                 }
   1484             }
   1485 
   1486             if(subchar < 0) {
   1487                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1488                 return NULL;
   1489             } else {
   1490                 /* function call for error cases */
   1491                 ++pSrc; /* continue after the lead byte */
   1492                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1493                 ++numSubstitutions;
   1494                 reqLength+=U16_LENGTH(ch);
   1495             }
   1496         }
   1497     }
   1498 
   1499     if(pNumSubstitutions!=NULL) {
   1500         *pNumSubstitutions=numSubstitutions;
   1501     }
   1502 
   1503     reqLength+=(int32_t)(pDest - dest);
   1504     if(pDestLength) {
   1505         *pDestLength = reqLength;
   1506     }
   1507 
   1508     /* Terminate the buffer */
   1509     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1510     return dest;
   1511 }
   1512 
   1513 U_CAPI char* U_EXPORT2
   1514 u_strToJavaModifiedUTF8(
   1515         char *dest,
   1516         int32_t destCapacity,
   1517         int32_t *pDestLength,
   1518         const UChar *src,
   1519         int32_t srcLength,
   1520         UErrorCode *pErrorCode) {
   1521     int32_t reqLength=0;
   1522     uint32_t ch=0;
   1523     uint8_t *pDest = (uint8_t *)dest;
   1524     uint8_t *pDestLimit = pDest + destCapacity;
   1525     const UChar *pSrcLimit;
   1526     int32_t count;
   1527 
   1528     /* args check */
   1529     if(U_FAILURE(*pErrorCode)){
   1530         return NULL;
   1531     }
   1532     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1533         (dest==NULL && destCapacity!=0) || destCapacity<0
   1534     ) {
   1535         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1536         return NULL;
   1537     }
   1538 
   1539     if(srcLength==-1) {
   1540         /* Convert NUL-terminated ASCII, then find the string length. */
   1541         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1542             *pDest++ = (uint8_t)ch;
   1543             ++src;
   1544         }
   1545         if(ch == 0) {
   1546             reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1547             if(pDestLength) {
   1548                 *pDestLength = reqLength;
   1549             }
   1550 
   1551             /* Terminate the buffer */
   1552             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1553             return dest;
   1554         }
   1555         srcLength = u_strlen(src);
   1556     }
   1557 
   1558     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1559     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
   1560     for(;;) {
   1561         count = (int32_t)(pDestLimit - pDest);
   1562         srcLength = (int32_t)(pSrcLimit - src);
   1563         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1564             /* fast ASCII loop */
   1565             const UChar *prevSrc = src;
   1566             int32_t delta;
   1567             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1568                 *pDest++=(uint8_t)ch;
   1569                 ++src;
   1570             }
   1571             delta = (int32_t)(src - prevSrc);
   1572             count -= delta;
   1573             srcLength -= delta;
   1574         }
   1575         /*
   1576          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1577          * bytes and one UChar.
   1578          */
   1579         count /= 3;
   1580         if(count > srcLength) {
   1581             count = srcLength; /* min(remaining dest/3, remaining src) */
   1582         }
   1583         if(count < 3) {
   1584             /*
   1585              * Too much overhead if we get near the end of the string,
   1586              * continue with the next loop.
   1587              */
   1588             break;
   1589         }
   1590         do {
   1591             ch=*src++;
   1592             if(ch <= 0x7f && ch != 0) {
   1593                 *pDest++ = (uint8_t)ch;
   1594             } else if(ch <= 0x7ff) {
   1595                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1596                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1597             } else {
   1598                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1599                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1600                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1601             }
   1602         } while(--count > 0);
   1603     }
   1604 
   1605     while(src<pSrcLimit) {
   1606         ch=*src++;
   1607         if(ch <= 0x7f && ch != 0) {
   1608             if(pDest<pDestLimit) {
   1609                 *pDest++ = (uint8_t)ch;
   1610             } else {
   1611                 reqLength = 1;
   1612                 break;
   1613             }
   1614         } else if(ch <= 0x7ff) {
   1615             if((pDestLimit - pDest) >= 2) {
   1616                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1617                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1618             } else {
   1619                 reqLength = 2;
   1620                 break;
   1621             }
   1622         } else {
   1623             if((pDestLimit - pDest) >= 3) {
   1624                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1625                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1626                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1627             } else {
   1628                 reqLength = 3;
   1629                 break;
   1630             }
   1631         }
   1632     }
   1633     while(src<pSrcLimit) {
   1634         ch=*src++;
   1635         if(ch <= 0x7f && ch != 0) {
   1636             ++reqLength;
   1637         } else if(ch<=0x7ff) {
   1638             reqLength+=2;
   1639         } else {
   1640             reqLength+=3;
   1641         }
   1642     }
   1643 
   1644     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1645     if(pDestLength){
   1646         *pDestLength = reqLength;
   1647     }
   1648 
   1649     /* Terminate the buffer */
   1650     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1651     return dest;
   1652 }
   1653