Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2001-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *
     11 * File ustrtrns.cpp
     12 *
     13 * Modification History:
     14 *
     15 *   Date        Name        Description
     16 *   9/10/2001    Ram    Creation.
     17 ******************************************************************************
     18 */
     19 
     20 /*******************************************************************************
     21  *
     22  * u_strTo* and u_strFrom* APIs
     23  * WCS functions moved to ustr_wcs.c for better modularization
     24  *
     25  *******************************************************************************
     26  */
     27 
     28 
     29 #include "unicode/putil.h"
     30 #include "unicode/ustring.h"
     31 #include "unicode/utf.h"
     32 #include "unicode/utf8.h"
     33 #include "unicode/utf16.h"
     34 #include "cstring.h"
     35 #include "cmemory.h"
     36 #include "ustr_imp.h"
     37 #include "uassert.h"
     38 
     39 U_CAPI UChar* U_EXPORT2
     40 u_strFromUTF32WithSub(UChar *dest,
     41                int32_t destCapacity,
     42                int32_t *pDestLength,
     43                const UChar32 *src,
     44                int32_t srcLength,
     45                UChar32 subchar, int32_t *pNumSubstitutions,
     46                UErrorCode *pErrorCode) {
     47     const UChar32 *srcLimit;
     48     UChar32 ch;
     49     UChar *destLimit;
     50     UChar *pDest;
     51     int32_t reqLength;
     52     int32_t numSubstitutions;
     53 
     54     /* args check */
     55     if(U_FAILURE(*pErrorCode)){
     56         return NULL;
     57     }
     58     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
     59         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
     60         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     61     ) {
     62         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     63         return NULL;
     64     }
     65 
     66     if(pNumSubstitutions != NULL) {
     67         *pNumSubstitutions = 0;
     68     }
     69 
     70     pDest = dest;
     71     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
     72     reqLength = 0;
     73     numSubstitutions = 0;
     74 
     75     if(srcLength < 0) {
     76         /* simple loop for conversion of a NUL-terminated BMP string */
     77         while((ch=*src) != 0 &&
     78               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     79             ++src;
     80             if(pDest < destLimit) {
     81                 *pDest++ = (UChar)ch;
     82             } else {
     83                 ++reqLength;
     84             }
     85         }
     86         srcLimit = src;
     87         if(ch != 0) {
     88             /* "complicated" case, find the end of the remaining string */
     89             while(*++srcLimit != 0) {}
     90         }
     91     } else {
     92       srcLimit = (src!=NULL)?(src + srcLength):NULL;
     93     }
     94 
     95     /* convert with length */
     96     while(src < srcLimit) {
     97         ch = *src++;
     98         do {
     99             /* usually "loops" once; twice only for writing subchar */
    100             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
    101                 if(pDest < destLimit) {
    102                     *pDest++ = (UChar)ch;
    103                 } else {
    104                     ++reqLength;
    105                 }
    106                 break;
    107             } else if(0x10000 <= ch && ch <= 0x10ffff) {
    108                 if(pDest!=NULL && ((pDest + 2) <= destLimit)) {
    109                     *pDest++ = U16_LEAD(ch);
    110                     *pDest++ = U16_TRAIL(ch);
    111                 } else {
    112                     reqLength += 2;
    113                 }
    114                 break;
    115             } else if((ch = subchar) < 0) {
    116                 /* surrogate code point, or not a Unicode code point at all */
    117                 *pErrorCode = U_INVALID_CHAR_FOUND;
    118                 return NULL;
    119             } else {
    120                 ++numSubstitutions;
    121             }
    122         } while(TRUE);
    123     }
    124 
    125     reqLength += (int32_t)(pDest - dest);
    126     if(pDestLength) {
    127         *pDestLength = reqLength;
    128     }
    129     if(pNumSubstitutions != NULL) {
    130         *pNumSubstitutions = numSubstitutions;
    131     }
    132 
    133     /* Terminate the buffer */
    134     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    135 
    136     return dest;
    137 }
    138 
    139 U_CAPI UChar* U_EXPORT2
    140 u_strFromUTF32(UChar *dest,
    141                int32_t destCapacity,
    142                int32_t *pDestLength,
    143                const UChar32 *src,
    144                int32_t srcLength,
    145                UErrorCode *pErrorCode) {
    146     return u_strFromUTF32WithSub(
    147             dest, destCapacity, pDestLength,
    148             src, srcLength,
    149             U_SENTINEL, NULL,
    150             pErrorCode);
    151 }
    152 
    153 U_CAPI UChar32* U_EXPORT2
    154 u_strToUTF32WithSub(UChar32 *dest,
    155              int32_t destCapacity,
    156              int32_t *pDestLength,
    157              const UChar *src,
    158              int32_t srcLength,
    159              UChar32 subchar, int32_t *pNumSubstitutions,
    160              UErrorCode *pErrorCode) {
    161     const UChar *srcLimit;
    162     UChar32 ch;
    163     UChar ch2;
    164     UChar32 *destLimit;
    165     UChar32 *pDest;
    166     int32_t reqLength;
    167     int32_t numSubstitutions;
    168 
    169     /* args check */
    170     if(U_FAILURE(*pErrorCode)){
    171         return NULL;
    172     }
    173     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    174         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    175         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    176     ) {
    177         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    178         return NULL;
    179     }
    180 
    181     if(pNumSubstitutions != NULL) {
    182         *pNumSubstitutions = 0;
    183     }
    184 
    185     pDest = dest;
    186     destLimit = (dest!=NULL)?(dest + destCapacity):NULL;
    187     reqLength = 0;
    188     numSubstitutions = 0;
    189 
    190     if(srcLength < 0) {
    191         /* simple loop for conversion of a NUL-terminated BMP string */
    192         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    193             ++src;
    194             if(pDest < destLimit) {
    195                 *pDest++ = ch;
    196             } else {
    197                 ++reqLength;
    198             }
    199         }
    200         srcLimit = src;
    201         if(ch != 0) {
    202             /* "complicated" case, find the end of the remaining string */
    203             while(*++srcLimit != 0) {}
    204         }
    205     } else {
    206         srcLimit = (src!=NULL)?(src + srcLength):NULL;
    207     }
    208 
    209     /* convert with length */
    210     while(src < srcLimit) {
    211         ch = *src++;
    212         if(!U16_IS_SURROGATE(ch)) {
    213             /* write or count ch below */
    214         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    215             ++src;
    216             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    217         } else if((ch = subchar) < 0) {
    218             /* unpaired surrogate */
    219             *pErrorCode = U_INVALID_CHAR_FOUND;
    220             return NULL;
    221         } else {
    222             ++numSubstitutions;
    223         }
    224         if(pDest < destLimit) {
    225             *pDest++ = ch;
    226         } else {
    227             ++reqLength;
    228         }
    229     }
    230 
    231     reqLength += (int32_t)(pDest - dest);
    232     if(pDestLength) {
    233         *pDestLength = reqLength;
    234     }
    235     if(pNumSubstitutions != NULL) {
    236         *pNumSubstitutions = numSubstitutions;
    237     }
    238 
    239     /* Terminate the buffer */
    240     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    241 
    242     return dest;
    243 }
    244 
    245 U_CAPI UChar32* U_EXPORT2
    246 u_strToUTF32(UChar32 *dest,
    247              int32_t destCapacity,
    248              int32_t *pDestLength,
    249              const UChar *src,
    250              int32_t srcLength,
    251              UErrorCode *pErrorCode) {
    252     return u_strToUTF32WithSub(
    253             dest, destCapacity, pDestLength,
    254             src, srcLength,
    255             U_SENTINEL, NULL,
    256             pErrorCode);
    257 }
    258 
    259 U_CAPI UChar* U_EXPORT2
    260 u_strFromUTF8WithSub(UChar *dest,
    261               int32_t destCapacity,
    262               int32_t *pDestLength,
    263               const char* src,
    264               int32_t srcLength,
    265               UChar32 subchar, int32_t *pNumSubstitutions,
    266               UErrorCode *pErrorCode){
    267     /* args check */
    268     if(U_FAILURE(*pErrorCode)) {
    269         return NULL;
    270     }
    271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    272         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    274     ) {
    275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    276         return NULL;
    277     }
    278 
    279     if(pNumSubstitutions!=NULL) {
    280         *pNumSubstitutions=0;
    281     }
    282     UChar *pDest = dest;
    283     UChar *pDestLimit = dest+destCapacity;
    284     int32_t reqLength = 0;
    285     int32_t numSubstitutions=0;
    286 
    287     /*
    288      * Inline processing of UTF-8 byte sequences:
    289      *
    290      * Byte sequences for the most common characters are handled inline in
    291      * the conversion loops. In order to reduce the path lengths for those
    292      * characters, the tests are arranged in a kind of binary search.
    293      * ASCII (<=0x7f) is checked first, followed by the dividing point
    294      * between 2- and 3-byte sequences (0xe0).
    295      * The 3-byte branch is tested first to speed up CJK text.
    296      * The compiler should combine the subtractions for the two tests for 0xe0.
    297      * Each branch then tests for the other end of its range.
    298      */
    299 
    300     if(srcLength < 0){
    301         /*
    302          * Transform a NUL-terminated string.
    303          * The code explicitly checks for NULs only in the lead byte position.
    304          * A NUL byte in the trail byte position fails the trail byte range check anyway.
    305          */
    306         int32_t i;
    307         UChar32 c;
    308         for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) {
    309             // modified copy of U8_NEXT()
    310             ++i;
    311             if(U8_IS_SINGLE(c)) {
    312                 *pDest++=(UChar)c;
    313             } else {
    314                 uint8_t __t1, __t2;
    315                 if( /* handle U+0800..U+FFFF inline */
    316                         (0xe0<=(c) && (c)<0xf0) &&
    317                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    318                         (__t2=src[(i)+1]-0x80)<=0x3f) {
    319                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    320                     i+=2;
    321                 } else if( /* handle U+0080..U+07FF inline */
    322                         ((c)<0xe0 && (c)>=0xc2) &&
    323                         (__t1=src[i]-0x80)<=0x3f) {
    324                     *pDest++ = (((c)&0x1f)<<6)|__t1;
    325                     ++(i);
    326                 } else {
    327                     /* function call for "complicated" and error cases */
    328                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
    329                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    330                         *pErrorCode = U_INVALID_CHAR_FOUND;
    331                         return NULL;
    332                     } else if(c<=0xFFFF) {
    333                         *(pDest++)=(UChar)c;
    334                     } else {
    335                         *(pDest++)=U16_LEAD(c);
    336                         if(pDest<pDestLimit) {
    337                             *(pDest++)=U16_TRAIL(c);
    338                         } else {
    339                             reqLength++;
    340                             break;
    341                         }
    342                     }
    343                 }
    344             }
    345         }
    346 
    347         /* Pre-flight the rest of the string. */
    348         while((c = (uint8_t)src[i]) != 0) {
    349             // modified copy of U8_NEXT()
    350             ++i;
    351             if(U8_IS_SINGLE(c)) {
    352                 ++reqLength;
    353             } else {
    354                 uint8_t __t1, __t2;
    355                 if( /* handle U+0800..U+FFFF inline */
    356                         (0xe0<=(c) && (c)<0xf0) &&
    357                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    358                         (__t2=src[(i)+1]-0x80)<=0x3f) {
    359                     ++reqLength;
    360                     i+=2;
    361                 } else if( /* handle U+0080..U+07FF inline */
    362                         ((c)<0xe0 && (c)>=0xc2) &&
    363                         (__t1=src[i]-0x80)<=0x3f) {
    364                     ++reqLength;
    365                     ++(i);
    366                 } else {
    367                     /* function call for "complicated" and error cases */
    368                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1);
    369                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    370                         *pErrorCode = U_INVALID_CHAR_FOUND;
    371                         return NULL;
    372                     }
    373                     reqLength += U16_LENGTH(c);
    374                 }
    375             }
    376         }
    377     } else /* srcLength >= 0 */ {
    378         /* Faster loop without ongoing checking for srcLength and pDestLimit. */
    379         int32_t i = 0;
    380         UChar32 c;
    381         for(;;) {
    382             /*
    383              * Each iteration of the inner loop progresses by at most 3 UTF-8
    384              * bytes and one UChar, for most characters.
    385              * For supplementary code points (4 & 2), which are rare,
    386              * there is an additional adjustment.
    387              */
    388             int32_t count = (int32_t)(pDestLimit - pDest);
    389             int32_t count2 = (srcLength - i) / 3;
    390             if(count > count2) {
    391                 count = count2; /* min(remaining dest, remaining src/3) */
    392             }
    393             if(count < 3) {
    394                 /*
    395                  * Too much overhead if we get near the end of the string,
    396                  * continue with the next loop.
    397                  */
    398                 break;
    399             }
    400 
    401             do {
    402                 // modified copy of U8_NEXT()
    403                 c = (uint8_t)src[i++];
    404                 if(U8_IS_SINGLE(c)) {
    405                     *pDest++=(UChar)c;
    406                 } else {
    407                     uint8_t __t1, __t2;
    408                     if( /* handle U+0800..U+FFFF inline */
    409                             (0xe0<=(c) && (c)<0xf0) &&
    410                             ((i)+1)<srcLength &&
    411                             U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    412                             (__t2=src[(i)+1]-0x80)<=0x3f) {
    413                         *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    414                         i+=2;
    415                     } else if( /* handle U+0080..U+07FF inline */
    416                             ((c)<0xe0 && (c)>=0xc2) &&
    417                             ((i)!=srcLength) &&
    418                             (__t1=src[i]-0x80)<=0x3f) {
    419                         *pDest++ = (((c)&0x1f)<<6)|__t1;
    420                         ++(i);
    421                     } else {
    422                         if(c >= 0xf0 || subchar > 0xffff) {
    423                             // We may read up to four bytes and write up to two UChars,
    424                             // which we didn't account for with computing count,
    425                             // so we adjust it here.
    426                             if(--count == 0) {
    427                                 --i;  // back out byte c
    428                                 break;
    429                             }
    430                         }
    431 
    432                         /* function call for "complicated" and error cases */
    433                         (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    434                         if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    435                             *pErrorCode = U_INVALID_CHAR_FOUND;
    436                             return NULL;
    437                         } else if(c<=0xFFFF) {
    438                             *(pDest++)=(UChar)c;
    439                         } else {
    440                             *(pDest++)=U16_LEAD(c);
    441                             *(pDest++)=U16_TRAIL(c);
    442                         }
    443                     }
    444                 }
    445             } while(--count > 0);
    446         }
    447 
    448         while(i < srcLength && (pDest < pDestLimit)) {
    449             // modified copy of U8_NEXT()
    450             c = (uint8_t)src[i++];
    451             if(U8_IS_SINGLE(c)) {
    452                 *pDest++=(UChar)c;
    453             } else {
    454                 uint8_t __t1, __t2;
    455                 if( /* handle U+0800..U+FFFF inline */
    456                         (0xe0<=(c) && (c)<0xf0) &&
    457                         ((i)+1)<srcLength &&
    458                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    459                         (__t2=src[(i)+1]-0x80)<=0x3f) {
    460                     *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2;
    461                     i+=2;
    462                 } else if( /* handle U+0080..U+07FF inline */
    463                         ((c)<0xe0 && (c)>=0xc2) &&
    464                         ((i)!=srcLength) &&
    465                         (__t1=src[i]-0x80)<=0x3f) {
    466                     *pDest++ = (((c)&0x1f)<<6)|__t1;
    467                     ++(i);
    468                 } else {
    469                     /* function call for "complicated" and error cases */
    470                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    471                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    472                         *pErrorCode = U_INVALID_CHAR_FOUND;
    473                         return NULL;
    474                     } else if(c<=0xFFFF) {
    475                         *(pDest++)=(UChar)c;
    476                     } else {
    477                         *(pDest++)=U16_LEAD(c);
    478                         if(pDest<pDestLimit) {
    479                             *(pDest++)=U16_TRAIL(c);
    480                         } else {
    481                             reqLength++;
    482                             break;
    483                         }
    484                     }
    485                 }
    486             }
    487         }
    488 
    489         /* Pre-flight the rest of the string. */
    490         while(i < srcLength) {
    491             // modified copy of U8_NEXT()
    492             c = (uint8_t)src[i++];
    493             if(U8_IS_SINGLE(c)) {
    494                 ++reqLength;
    495             } else {
    496                 uint8_t __t1, __t2;
    497                 if( /* handle U+0800..U+FFFF inline */
    498                         (0xe0<=(c) && (c)<0xf0) &&
    499                         ((i)+1)<srcLength &&
    500                         U8_IS_VALID_LEAD3_AND_T1((c), src[i]) &&
    501                         (__t2=src[(i)+1]-0x80)<=0x3f) {
    502                     ++reqLength;
    503                     i+=2;
    504                 } else if( /* handle U+0080..U+07FF inline */
    505                         ((c)<0xe0 && (c)>=0xc2) &&
    506                         ((i)!=srcLength) &&
    507                         (__t1=src[i]-0x80)<=0x3f) {
    508                     ++reqLength;
    509                     ++(i);
    510                 } else {
    511                     /* function call for "complicated" and error cases */
    512                     (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1);
    513                     if(c<0 && (++numSubstitutions, c = subchar) < 0) {
    514                         *pErrorCode = U_INVALID_CHAR_FOUND;
    515                         return NULL;
    516                     }
    517                     reqLength += U16_LENGTH(c);
    518                 }
    519             }
    520         }
    521     }
    522 
    523     reqLength+=(int32_t)(pDest - dest);
    524 
    525     if(pNumSubstitutions!=NULL) {
    526         *pNumSubstitutions=numSubstitutions;
    527     }
    528 
    529     if(pDestLength){
    530         *pDestLength = reqLength;
    531     }
    532 
    533     /* Terminate the buffer */
    534     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    535 
    536     return dest;
    537 }
    538 
    539 U_CAPI UChar* U_EXPORT2
    540 u_strFromUTF8(UChar *dest,
    541               int32_t destCapacity,
    542               int32_t *pDestLength,
    543               const char* src,
    544               int32_t srcLength,
    545               UErrorCode *pErrorCode){
    546     return u_strFromUTF8WithSub(
    547             dest, destCapacity, pDestLength,
    548             src, srcLength,
    549             U_SENTINEL, NULL,
    550             pErrorCode);
    551 }
    552 
    553 U_CAPI UChar * U_EXPORT2
    554 u_strFromUTF8Lenient(UChar *dest,
    555                      int32_t destCapacity,
    556                      int32_t *pDestLength,
    557                      const char *src,
    558                      int32_t srcLength,
    559                      UErrorCode *pErrorCode) {
    560     UChar *pDest = dest;
    561     UChar32 ch;
    562     int32_t reqLength = 0;
    563     uint8_t* pSrc = (uint8_t*) src;
    564 
    565     /* args check */
    566     if(U_FAILURE(*pErrorCode)){
    567         return NULL;
    568     }
    569 
    570     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    571         (destCapacity<0) || (dest == NULL && destCapacity > 0)
    572     ) {
    573         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    574         return NULL;
    575     }
    576 
    577     if(srcLength < 0) {
    578         /* Transform a NUL-terminated string. */
    579         UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL;
    580         uint8_t t1, t2, t3; /* trail bytes */
    581 
    582         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    583             if(ch < 0xc0) {
    584                 /*
    585                  * ASCII, or a trail byte in lead position which is treated like
    586                  * a single-byte sequence for better character boundary
    587                  * resynchronization after illegal sequences.
    588                  */
    589                 *pDest++=(UChar)ch;
    590                 ++pSrc;
    591                 continue;
    592             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    593                 if((t1 = pSrc[1]) != 0) {
    594                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    595                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
    596                     pSrc += 2;
    597                     continue;
    598                 }
    599             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    600                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    601                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    602                     /* 0x2080 = (0x80 << 6) + 0x80 */
    603                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    604                     pSrc += 3;
    605                     continue;
    606                 }
    607             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    608                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    609                     pSrc += 4;
    610                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    611                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    612                     *(pDest++) = U16_LEAD(ch);
    613                     if(pDest < pDestLimit) {
    614                         *(pDest++) = U16_TRAIL(ch);
    615                     } else {
    616                         reqLength = 1;
    617                         break;
    618                     }
    619                     continue;
    620                 }
    621             }
    622 
    623             /* truncated character at the end */
    624             *pDest++ = 0xfffd;
    625             while(*++pSrc != 0) {}
    626             break;
    627         }
    628 
    629         /* Pre-flight the rest of the string. */
    630         while((ch = *pSrc) != 0) {
    631             if(ch < 0xc0) {
    632                 /*
    633                  * ASCII, or a trail byte in lead position which is treated like
    634                  * a single-byte sequence for better character boundary
    635                  * resynchronization after illegal sequences.
    636                  */
    637                 ++reqLength;
    638                 ++pSrc;
    639                 continue;
    640             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    641                 if(pSrc[1] != 0) {
    642                     ++reqLength;
    643                     pSrc += 2;
    644                     continue;
    645                 }
    646             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    647                 if(pSrc[1] != 0 && pSrc[2] != 0) {
    648                     ++reqLength;
    649                     pSrc += 3;
    650                     continue;
    651                 }
    652             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    653                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    654                     reqLength += 2;
    655                     pSrc += 4;
    656                     continue;
    657                 }
    658             }
    659 
    660             /* truncated character at the end */
    661             ++reqLength;
    662             break;
    663         }
    664     } else /* srcLength >= 0 */ {
    665       const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL;
    666 
    667         /*
    668          * This function requires that if srcLength is given, then it must be
    669          * destCapatity >= srcLength so that we need not check for
    670          * destination buffer overflow in the loop.
    671          */
    672         if(destCapacity < srcLength) {
    673             if(pDestLength != NULL) {
    674                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    675             }
    676             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    677             return NULL;
    678         }
    679 
    680         if((pSrcLimit - pSrc) >= 4) {
    681             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    682 
    683             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    684             do {
    685                 ch = *pSrc++;
    686                 if(ch < 0xc0) {
    687                     /*
    688                      * ASCII, or a trail byte in lead position which is treated like
    689                      * a single-byte sequence for better character boundary
    690                      * resynchronization after illegal sequences.
    691                      */
    692                     *pDest++=(UChar)ch;
    693                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
    694                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    695                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    696                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    697                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    698                     /* 0x2080 = (0x80 << 6) + 0x80 */
    699                     ch = (ch << 12) + (*pSrc++ << 6);
    700                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    701                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    702                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    703                     ch = (ch << 18) + (*pSrc++ << 12);
    704                     ch += *pSrc++ << 6;
    705                     ch += *pSrc++ - 0x3c82080;
    706                     *(pDest++) = U16_LEAD(ch);
    707                     *(pDest++) = U16_TRAIL(ch);
    708                 }
    709             } while(pSrc < pSrcLimit);
    710 
    711             pSrcLimit += 3; /* restore original pSrcLimit */
    712         }
    713 
    714         while(pSrc < pSrcLimit) {
    715             ch = *pSrc++;
    716             if(ch < 0xc0) {
    717                 /*
    718                  * ASCII, or a trail byte in lead position which is treated like
    719                  * a single-byte sequence for better character boundary
    720                  * resynchronization after illegal sequences.
    721                  */
    722                 *pDest++=(UChar)ch;
    723                 continue;
    724             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    725                 if(pSrc < pSrcLimit) {
    726                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    727                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    728                     continue;
    729                 }
    730             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    731                 if((pSrcLimit - pSrc) >= 2) {
    732                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    733                     /* 0x2080 = (0x80 << 6) + 0x80 */
    734                     ch = (ch << 12) + (*pSrc++ << 6);
    735                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    736                     pSrc += 3;
    737                     continue;
    738                 }
    739             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    740                 if((pSrcLimit - pSrc) >= 3) {
    741                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    742                     ch = (ch << 18) + (*pSrc++ << 12);
    743                     ch += *pSrc++ << 6;
    744                     ch += *pSrc++ - 0x3c82080;
    745                     *(pDest++) = U16_LEAD(ch);
    746                     *(pDest++) = U16_TRAIL(ch);
    747                     pSrc += 4;
    748                     continue;
    749                 }
    750             }
    751 
    752             /* truncated character at the end */
    753             *pDest++ = 0xfffd;
    754             break;
    755         }
    756     }
    757 
    758     reqLength+=(int32_t)(pDest - dest);
    759 
    760     if(pDestLength){
    761         *pDestLength = reqLength;
    762     }
    763 
    764     /* Terminate the buffer */
    765     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    766 
    767     return dest;
    768 }
    769 
    770 static inline uint8_t *
    771 _appendUTF8(uint8_t *pDest, UChar32 c) {
    772     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    773     if((c)<=0x7f) {
    774         *pDest++=(uint8_t)c;
    775     } else if(c<=0x7ff) {
    776         *pDest++=(uint8_t)((c>>6)|0xc0);
    777         *pDest++=(uint8_t)((c&0x3f)|0x80);
    778     } else if(c<=0xffff) {
    779         *pDest++=(uint8_t)((c>>12)|0xe0);
    780         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
    781         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    782     } else /* if((uint32_t)(c)<=0x10ffff) */ {
    783         *pDest++=(uint8_t)(((c)>>18)|0xf0);
    784         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
    785         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
    786         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    787     }
    788     return pDest;
    789 }
    790 
    791 
    792 U_CAPI char* U_EXPORT2
    793 u_strToUTF8WithSub(char *dest,
    794             int32_t destCapacity,
    795             int32_t *pDestLength,
    796             const UChar *pSrc,
    797             int32_t srcLength,
    798             UChar32 subchar, int32_t *pNumSubstitutions,
    799             UErrorCode *pErrorCode){
    800     int32_t reqLength=0;
    801     uint32_t ch=0,ch2=0;
    802     uint8_t *pDest = (uint8_t *)dest;
    803     uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL;
    804     int32_t numSubstitutions;
    805 
    806     /* args check */
    807     if(U_FAILURE(*pErrorCode)){
    808         return NULL;
    809     }
    810 
    811     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
    812         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    813         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    814     ) {
    815         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    816         return NULL;
    817     }
    818 
    819     if(pNumSubstitutions!=NULL) {
    820         *pNumSubstitutions=0;
    821     }
    822     numSubstitutions=0;
    823 
    824     if(srcLength==-1) {
    825         while((ch=*pSrc)!=0) {
    826             ++pSrc;
    827             if(ch <= 0x7f) {
    828                 if(pDest<pDestLimit) {
    829                     *pDest++ = (uint8_t)ch;
    830                 } else {
    831                     reqLength = 1;
    832                     break;
    833                 }
    834             } else if(ch <= 0x7ff) {
    835                 if((pDestLimit - pDest) >= 2) {
    836                     *pDest++=(uint8_t)((ch>>6)|0xc0);
    837                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    838                 } else {
    839                     reqLength = 2;
    840                     break;
    841                 }
    842             } else if(ch <= 0xd7ff || ch >= 0xe000) {
    843                 if((pDestLimit - pDest) >= 3) {
    844                     *pDest++=(uint8_t)((ch>>12)|0xe0);
    845                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    846                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    847                 } else {
    848                     reqLength = 3;
    849                     break;
    850                 }
    851             } else /* ch is a surrogate */ {
    852                 int32_t length;
    853 
    854                 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/
    855                 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
    856                     ++pSrc;
    857                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
    858                 } else if(subchar>=0) {
    859                     ch=subchar;
    860                     ++numSubstitutions;
    861                 } else {
    862                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    863                     *pErrorCode = U_INVALID_CHAR_FOUND;
    864                     return NULL;
    865                 }
    866 
    867                 length = U8_LENGTH(ch);
    868                 if((pDestLimit - pDest) >= length) {
    869                     /* convert and append*/
    870                     pDest=_appendUTF8(pDest, ch);
    871                 } else {
    872                     reqLength = length;
    873                     break;
    874                 }
    875             }
    876         }
    877         while((ch=*pSrc++)!=0) {
    878             if(ch<=0x7f) {
    879                 ++reqLength;
    880             } else if(ch<=0x7ff) {
    881                 reqLength+=2;
    882             } else if(!U16_IS_SURROGATE(ch)) {
    883                 reqLength+=3;
    884             } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
    885                 ++pSrc;
    886                 reqLength+=4;
    887             } else if(subchar>=0) {
    888                 reqLength+=U8_LENGTH(subchar);
    889                 ++numSubstitutions;
    890             } else {
    891                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    892                 *pErrorCode = U_INVALID_CHAR_FOUND;
    893                 return NULL;
    894             }
    895         }
    896     } else {
    897         const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL;
    898         int32_t count;
    899 
    900         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    901         for(;;) {
    902             /*
    903              * Each iteration of the inner loop progresses by at most 3 UTF-8
    904              * bytes and one UChar, for most characters.
    905              * For supplementary code points (4 & 2), which are rare,
    906              * there is an additional adjustment.
    907              */
    908             count = (int32_t)((pDestLimit - pDest) / 3);
    909             srcLength = (int32_t)(pSrcLimit - pSrc);
    910             if(count > srcLength) {
    911                 count = srcLength; /* min(remaining dest/3, remaining src) */
    912             }
    913             if(count < 3) {
    914                 /*
    915                  * Too much overhead if we get near the end of the string,
    916                  * continue with the next loop.
    917                  */
    918                 break;
    919             }
    920             do {
    921                 ch=*pSrc++;
    922                 if(ch <= 0x7f) {
    923                     *pDest++ = (uint8_t)ch;
    924                 } else if(ch <= 0x7ff) {
    925                     *pDest++=(uint8_t)((ch>>6)|0xc0);
    926                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    927                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
    928                     *pDest++=(uint8_t)((ch>>12)|0xe0);
    929                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    930                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    931                 } else /* ch is a surrogate */ {
    932                     /*
    933                      * We will read two UChars and probably output four bytes,
    934                      * which we didn't account for with computing count,
    935                      * so we adjust it here.
    936                      */
    937                     if(--count == 0) {
    938                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
    939                         break;  /* recompute count */
    940                     }
    941 
    942                     if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) {
    943                         ++pSrc;
    944                         ch=U16_GET_SUPPLEMENTARY(ch, ch2);
    945 
    946                         /* writing 4 bytes per 2 UChars is ok */
    947                         *pDest++=(uint8_t)((ch>>18)|0xf0);
    948                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
    949                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    950                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
    951                     } else  {
    952                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
    953                         if(subchar>=0) {
    954                             ch=subchar;
    955                             ++numSubstitutions;
    956                         } else {
    957                             *pErrorCode = U_INVALID_CHAR_FOUND;
    958                             return NULL;
    959                         }
    960 
    961                         /* convert and append*/
    962                         pDest=_appendUTF8(pDest, ch);
    963                     }
    964                 }
    965             } while(--count > 0);
    966         }
    967 
    968         while(pSrc<pSrcLimit) {
    969             ch=*pSrc++;
    970             if(ch <= 0x7f) {
    971                 if(pDest<pDestLimit) {
    972                     *pDest++ = (uint8_t)ch;
    973                 } else {
    974                     reqLength = 1;
    975                     break;
    976                 }
    977             } else if(ch <= 0x7ff) {
    978                 if((pDestLimit - pDest) >= 2) {
    979                     *pDest++=(uint8_t)((ch>>6)|0xc0);
    980                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    981                 } else {
    982                     reqLength = 2;
    983                     break;
    984                 }
    985             } else if(ch <= 0xd7ff || ch >= 0xe000) {
    986                 if((pDestLimit - pDest) >= 3) {
    987                     *pDest++=(uint8_t)((ch>>12)|0xe0);
    988                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
    989                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
    990                 } else {
    991                     reqLength = 3;
    992                     break;
    993                 }
    994             } else /* ch is a surrogate */ {
    995                 int32_t length;
    996 
    997                 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
    998                     ++pSrc;
    999                     ch=U16_GET_SUPPLEMENTARY(ch, ch2);
   1000                 } else if(subchar>=0) {
   1001                     ch=subchar;
   1002                     ++numSubstitutions;
   1003                 } else {
   1004                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1005                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1006                     return NULL;
   1007                 }
   1008 
   1009                 length = U8_LENGTH(ch);
   1010                 if((pDestLimit - pDest) >= length) {
   1011                     /* convert and append*/
   1012                     pDest=_appendUTF8(pDest, ch);
   1013                 } else {
   1014                     reqLength = length;
   1015                     break;
   1016                 }
   1017             }
   1018         }
   1019         while(pSrc<pSrcLimit) {
   1020             ch=*pSrc++;
   1021             if(ch<=0x7f) {
   1022                 ++reqLength;
   1023             } else if(ch<=0x7ff) {
   1024                 reqLength+=2;
   1025             } else if(!U16_IS_SURROGATE(ch)) {
   1026                 reqLength+=3;
   1027             } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) {
   1028                 ++pSrc;
   1029                 reqLength+=4;
   1030             } else if(subchar>=0) {
   1031                 reqLength+=U8_LENGTH(subchar);
   1032                 ++numSubstitutions;
   1033             } else {
   1034                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1035                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1036                 return NULL;
   1037             }
   1038         }
   1039     }
   1040 
   1041     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1042 
   1043     if(pNumSubstitutions!=NULL) {
   1044         *pNumSubstitutions=numSubstitutions;
   1045     }
   1046 
   1047     if(pDestLength){
   1048         *pDestLength = reqLength;
   1049     }
   1050 
   1051     /* Terminate the buffer */
   1052     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1053     return dest;
   1054 }
   1055 
   1056 U_CAPI char* U_EXPORT2
   1057 u_strToUTF8(char *dest,
   1058             int32_t destCapacity,
   1059             int32_t *pDestLength,
   1060             const UChar *pSrc,
   1061             int32_t srcLength,
   1062             UErrorCode *pErrorCode){
   1063     return u_strToUTF8WithSub(
   1064             dest, destCapacity, pDestLength,
   1065             pSrc, srcLength,
   1066             U_SENTINEL, NULL,
   1067             pErrorCode);
   1068 }
   1069 
   1070 U_CAPI UChar* U_EXPORT2
   1071 u_strFromJavaModifiedUTF8WithSub(
   1072         UChar *dest,
   1073         int32_t destCapacity,
   1074         int32_t *pDestLength,
   1075         const char *src,
   1076         int32_t srcLength,
   1077         UChar32 subchar, int32_t *pNumSubstitutions,
   1078         UErrorCode *pErrorCode) {
   1079     /* args check */
   1080     if(U_FAILURE(*pErrorCode)) {
   1081         return NULL;
   1082     }
   1083     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1084         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
   1085         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1086     ) {
   1087         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1088         return NULL;
   1089     }
   1090 
   1091     if(pNumSubstitutions!=NULL) {
   1092         *pNumSubstitutions=0;
   1093     }
   1094     UChar *pDest = dest;
   1095     UChar *pDestLimit = dest+destCapacity;
   1096     int32_t reqLength = 0;
   1097     int32_t numSubstitutions=0;
   1098 
   1099     if(srcLength < 0) {
   1100         /*
   1101          * Transform a NUL-terminated ASCII string.
   1102          * Handle non-ASCII strings with slower code.
   1103          */
   1104         UChar32 c;
   1105         while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) {
   1106             *pDest++=(UChar)c;
   1107             ++src;
   1108         }
   1109         if(c == 0) {
   1110             reqLength=(int32_t)(pDest - dest);
   1111             if(pDestLength) {
   1112                 *pDestLength = reqLength;
   1113             }
   1114 
   1115             /* Terminate the buffer */
   1116             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1117             return dest;
   1118         }
   1119         srcLength = static_cast<int32_t>(uprv_strlen(src));
   1120     }
   1121 
   1122     /* Faster loop without ongoing checking for srcLength and pDestLimit. */
   1123     UChar32 ch;
   1124     uint8_t t1, t2;
   1125     int32_t i = 0;
   1126     for(;;) {
   1127         int32_t count = (int32_t)(pDestLimit - pDest);
   1128         int32_t count2 = srcLength - i;
   1129         if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) {
   1130             /* fast ASCII loop */
   1131             int32_t start = i;
   1132             uint8_t b;
   1133             while(i < srcLength && U8_IS_SINGLE(b = src[i])) {
   1134                 *pDest++=b;
   1135                 ++i;
   1136             }
   1137             int32_t delta = i - start;
   1138             count -= delta;
   1139             count2 -= delta;
   1140         }
   1141         /*
   1142          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1143          * bytes and one UChar.
   1144          */
   1145         if(subchar > 0xFFFF) {
   1146             break;
   1147         }
   1148         count2 /= 3;
   1149         if(count > count2) {
   1150             count = count2; /* min(remaining dest, remaining src/3) */
   1151         }
   1152         if(count < 3) {
   1153             /*
   1154              * Too much overhead if we get near the end of the string,
   1155              * continue with the next loop.
   1156              */
   1157             break;
   1158         }
   1159         do {
   1160             ch = (uint8_t)src[i++];
   1161             if(U8_IS_SINGLE(ch)) {
   1162                 *pDest++=(UChar)ch;
   1163             } else {
   1164                 if(ch >= 0xe0) {
   1165                     if( /* handle U+0000..U+FFFF inline */
   1166                         ch <= 0xef &&
   1167                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
   1168                         (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
   1169                     ) {
   1170                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1171                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1172                         i += 2;
   1173                         continue;
   1174                     }
   1175                 } else {
   1176                     if( /* handle U+0000..U+07FF inline */
   1177                         ch >= 0xc0 &&
   1178                         (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
   1179                     ) {
   1180                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1181                         ++i;
   1182                         continue;
   1183                     }
   1184                 }
   1185 
   1186                 if(subchar < 0) {
   1187                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1188                     return NULL;
   1189                 } else if(subchar > 0xffff && --count == 0) {
   1190                     /*
   1191                      * We need to write two UChars, adjusted count for that,
   1192                      * and ran out of space.
   1193                      */
   1194                     --i;  // back out byte ch
   1195                     break;
   1196                 } else {
   1197                     /* function call for error cases */
   1198                     utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1199                     ++numSubstitutions;
   1200                     *(pDest++)=(UChar)subchar;
   1201                 }
   1202             }
   1203         } while(--count > 0);
   1204     }
   1205 
   1206     while(i < srcLength && (pDest < pDestLimit)) {
   1207         ch = (uint8_t)src[i++];
   1208         if(U8_IS_SINGLE(ch)){
   1209             *pDest++=(UChar)ch;
   1210         } else {
   1211             if(ch >= 0xe0) {
   1212                 if( /* handle U+0000..U+FFFF inline */
   1213                     ch <= 0xef &&
   1214                     (i+1) < srcLength &&
   1215                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f &&
   1216                     (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f
   1217                 ) {
   1218                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1219                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1220                     i += 2;
   1221                     continue;
   1222                 }
   1223             } else {
   1224                 if( /* handle U+0000..U+07FF inline */
   1225                     ch >= 0xc0 &&
   1226                     i < srcLength &&
   1227                     (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f
   1228                 ) {
   1229                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1230                     ++i;
   1231                     continue;
   1232                 }
   1233             }
   1234 
   1235             if(subchar < 0) {
   1236                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1237                 return NULL;
   1238             } else {
   1239                 /* function call for error cases */
   1240                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1241                 ++numSubstitutions;
   1242                 if(subchar<=0xFFFF) {
   1243                     *(pDest++)=(UChar)subchar;
   1244                 } else {
   1245                     *(pDest++)=U16_LEAD(subchar);
   1246                     if(pDest<pDestLimit) {
   1247                         *(pDest++)=U16_TRAIL(subchar);
   1248                     } else {
   1249                         reqLength++;
   1250                         break;
   1251                     }
   1252                 }
   1253             }
   1254         }
   1255     }
   1256 
   1257     /* Pre-flight the rest of the string. */
   1258     while(i < srcLength) {
   1259         ch = (uint8_t)src[i++];
   1260         if(U8_IS_SINGLE(ch)) {
   1261             reqLength++;
   1262         } else {
   1263             if(ch >= 0xe0) {
   1264                 if( /* handle U+0000..U+FFFF inline */
   1265                     ch <= 0xef &&
   1266                     (i+1) < srcLength &&
   1267                     (uint8_t)(src[i] - 0x80) <= 0x3f &&
   1268                     (uint8_t)(src[i+1] - 0x80) <= 0x3f
   1269                 ) {
   1270                     reqLength++;
   1271                     i += 2;
   1272                     continue;
   1273                 }
   1274             } else {
   1275                 if( /* handle U+0000..U+07FF inline */
   1276                     ch >= 0xc0 &&
   1277                     i < srcLength &&
   1278                     (uint8_t)(src[i] - 0x80) <= 0x3f
   1279                 ) {
   1280                     reqLength++;
   1281                     ++i;
   1282                     continue;
   1283                 }
   1284             }
   1285 
   1286             if(subchar < 0) {
   1287                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1288                 return NULL;
   1289             } else {
   1290                 /* function call for error cases */
   1291                 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1);
   1292                 ++numSubstitutions;
   1293                 reqLength+=U16_LENGTH(ch);
   1294             }
   1295         }
   1296     }
   1297 
   1298     if(pNumSubstitutions!=NULL) {
   1299         *pNumSubstitutions=numSubstitutions;
   1300     }
   1301 
   1302     reqLength+=(int32_t)(pDest - dest);
   1303     if(pDestLength) {
   1304         *pDestLength = reqLength;
   1305     }
   1306 
   1307     /* Terminate the buffer */
   1308     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1309     return dest;
   1310 }
   1311 
   1312 U_CAPI char* U_EXPORT2
   1313 u_strToJavaModifiedUTF8(
   1314         char *dest,
   1315         int32_t destCapacity,
   1316         int32_t *pDestLength,
   1317         const UChar *src,
   1318         int32_t srcLength,
   1319         UErrorCode *pErrorCode) {
   1320     int32_t reqLength=0;
   1321     uint32_t ch=0;
   1322     uint8_t *pDest = (uint8_t *)dest;
   1323     uint8_t *pDestLimit = pDest + destCapacity;
   1324     const UChar *pSrcLimit;
   1325     int32_t count;
   1326 
   1327     /* args check */
   1328     if(U_FAILURE(*pErrorCode)){
   1329         return NULL;
   1330     }
   1331     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1332         (dest==NULL && destCapacity!=0) || destCapacity<0
   1333     ) {
   1334         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1335         return NULL;
   1336     }
   1337 
   1338     if(srcLength==-1) {
   1339         /* Convert NUL-terminated ASCII, then find the string length. */
   1340         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1341             *pDest++ = (uint8_t)ch;
   1342             ++src;
   1343         }
   1344         if(ch == 0) {
   1345             reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1346             if(pDestLength) {
   1347                 *pDestLength = reqLength;
   1348             }
   1349 
   1350             /* Terminate the buffer */
   1351             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1352             return dest;
   1353         }
   1354         srcLength = u_strlen(src);
   1355     }
   1356 
   1357     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1358     pSrcLimit = (src!=NULL)?(src+srcLength):NULL;
   1359     for(;;) {
   1360         count = (int32_t)(pDestLimit - pDest);
   1361         srcLength = (int32_t)(pSrcLimit - src);
   1362         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1363             /* fast ASCII loop */
   1364             const UChar *prevSrc = src;
   1365             int32_t delta;
   1366             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1367                 *pDest++=(uint8_t)ch;
   1368                 ++src;
   1369             }
   1370             delta = (int32_t)(src - prevSrc);
   1371             count -= delta;
   1372             srcLength -= delta;
   1373         }
   1374         /*
   1375          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1376          * bytes and one UChar.
   1377          */
   1378         count /= 3;
   1379         if(count > srcLength) {
   1380             count = srcLength; /* min(remaining dest/3, remaining src) */
   1381         }
   1382         if(count < 3) {
   1383             /*
   1384              * Too much overhead if we get near the end of the string,
   1385              * continue with the next loop.
   1386              */
   1387             break;
   1388         }
   1389         do {
   1390             ch=*src++;
   1391             if(ch <= 0x7f && ch != 0) {
   1392                 *pDest++ = (uint8_t)ch;
   1393             } else if(ch <= 0x7ff) {
   1394                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1395                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1396             } else {
   1397                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1398                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1399                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1400             }
   1401         } while(--count > 0);
   1402     }
   1403 
   1404     while(src<pSrcLimit) {
   1405         ch=*src++;
   1406         if(ch <= 0x7f && ch != 0) {
   1407             if(pDest<pDestLimit) {
   1408                 *pDest++ = (uint8_t)ch;
   1409             } else {
   1410                 reqLength = 1;
   1411                 break;
   1412             }
   1413         } else if(ch <= 0x7ff) {
   1414             if((pDestLimit - pDest) >= 2) {
   1415                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1416                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1417             } else {
   1418                 reqLength = 2;
   1419                 break;
   1420             }
   1421         } else {
   1422             if((pDestLimit - pDest) >= 3) {
   1423                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1424                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1425                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1426             } else {
   1427                 reqLength = 3;
   1428                 break;
   1429             }
   1430         }
   1431     }
   1432     while(src<pSrcLimit) {
   1433         ch=*src++;
   1434         if(ch <= 0x7f && ch != 0) {
   1435             ++reqLength;
   1436         } else if(ch<=0x7ff) {
   1437             reqLength+=2;
   1438         } else {
   1439             reqLength+=3;
   1440         }
   1441     }
   1442 
   1443     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1444     if(pDestLength){
   1445         *pDestLength = reqLength;
   1446     }
   1447 
   1448     /* Terminate the buffer */
   1449     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1450     return dest;
   1451 }
   1452