Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *
      9 * File ustrtrns.c
     10 *
     11 * Modification History:
     12 *
     13 *   Date        Name        Description
     14 *   9/10/2001    Ram    Creation.
     15 ******************************************************************************
     16 */
     17 
     18 /*******************************************************************************
     19  *
     20  * u_strTo* and u_strFrom* APIs
     21  * WCS functions moved to ustr_wcs.c for better modularization
     22  *
     23  *******************************************************************************
     24  */
     25 
     26 
     27 #include "unicode/putil.h"
     28 #include "unicode/ustring.h"
     29 #include "cstring.h"
     30 #include "cmemory.h"
     31 #include "ustr_imp.h"
     32 
     33 U_CAPI UChar* U_EXPORT2
     34 u_strFromUTF32WithSub(UChar *dest,
     35                int32_t destCapacity,
     36                int32_t *pDestLength,
     37                const UChar32 *src,
     38                int32_t srcLength,
     39                UChar32 subchar, int32_t *pNumSubstitutions,
     40                UErrorCode *pErrorCode) {
     41     const UChar32 *srcLimit;
     42     UChar32 ch;
     43     UChar *destLimit;
     44     UChar *pDest;
     45     int32_t reqLength;
     46     int32_t numSubstitutions;
     47 
     48     /* args check */
     49     if(U_FAILURE(*pErrorCode)){
     50         return NULL;
     51     }
     52     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
     53         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
     54         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
     55     ) {
     56         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
     57         return NULL;
     58     }
     59 
     60     if(pNumSubstitutions != NULL) {
     61         *pNumSubstitutions = 0;
     62     }
     63 
     64     pDest = dest;
     65     destLimit = dest + destCapacity;
     66     reqLength = 0;
     67     numSubstitutions = 0;
     68 
     69     if(srcLength < 0) {
     70         /* simple loop for conversion of a NUL-terminated BMP string */
     71         while((ch=*src) != 0 &&
     72               ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) {
     73             ++src;
     74             if(pDest < destLimit) {
     75                 *pDest++ = (UChar)ch;
     76             } else {
     77                 ++reqLength;
     78             }
     79         }
     80         srcLimit = src;
     81         if(ch != 0) {
     82             /* "complicated" case, find the end of the remaining string */
     83             while(*++srcLimit != 0) {}
     84         }
     85     } else {
     86         srcLimit = src + srcLength;
     87     }
     88 
     89     /* convert with length */
     90     while(src < srcLimit) {
     91         ch = *src++;
     92         do {
     93             /* usually "loops" once; twice only for writing subchar */
     94             if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) {
     95                 if(pDest < destLimit) {
     96                     *pDest++ = (UChar)ch;
     97                 } else {
     98                     ++reqLength;
     99                 }
    100                 break;
    101             } else if(0x10000 <= ch && ch <= 0x10ffff) {
    102                 if((pDest + 2) <= destLimit) {
    103                     *pDest++ = U16_LEAD(ch);
    104                     *pDest++ = U16_TRAIL(ch);
    105                 } else {
    106                     reqLength += 2;
    107                 }
    108                 break;
    109             } else if((ch = subchar) < 0) {
    110                 /* surrogate code point, or not a Unicode code point at all */
    111                 *pErrorCode = U_INVALID_CHAR_FOUND;
    112                 return NULL;
    113             } else {
    114                 ++numSubstitutions;
    115             }
    116         } while(TRUE);
    117     }
    118 
    119     reqLength += (int32_t)(pDest - dest);
    120     if(pDestLength) {
    121         *pDestLength = reqLength;
    122     }
    123     if(pNumSubstitutions != NULL) {
    124         *pNumSubstitutions = numSubstitutions;
    125     }
    126 
    127     /* Terminate the buffer */
    128     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
    129 
    130     return dest;
    131 }
    132 
    133 U_CAPI UChar* U_EXPORT2
    134 u_strFromUTF32(UChar *dest,
    135                int32_t destCapacity,
    136                int32_t *pDestLength,
    137                const UChar32 *src,
    138                int32_t srcLength,
    139                UErrorCode *pErrorCode) {
    140     return u_strFromUTF32WithSub(
    141             dest, destCapacity, pDestLength,
    142             src, srcLength,
    143             U_SENTINEL, NULL,
    144             pErrorCode);
    145 }
    146 
    147 U_CAPI UChar32* U_EXPORT2
    148 u_strToUTF32WithSub(UChar32 *dest,
    149              int32_t destCapacity,
    150              int32_t *pDestLength,
    151              const UChar *src,
    152              int32_t srcLength,
    153              UChar32 subchar, int32_t *pNumSubstitutions,
    154              UErrorCode *pErrorCode) {
    155     const UChar *srcLimit;
    156     UChar32 ch;
    157     UChar ch2;
    158     UChar32 *destLimit;
    159     UChar32 *pDest;
    160     int32_t reqLength;
    161     int32_t numSubstitutions;
    162 
    163     /* args check */
    164     if(U_FAILURE(*pErrorCode)){
    165         return NULL;
    166     }
    167     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    168         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    169         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    170     ) {
    171         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    172         return NULL;
    173     }
    174 
    175     if(pNumSubstitutions != NULL) {
    176         *pNumSubstitutions = 0;
    177     }
    178 
    179     pDest = dest;
    180     destLimit = dest + destCapacity;
    181     reqLength = 0;
    182     numSubstitutions = 0;
    183 
    184     if(srcLength < 0) {
    185         /* simple loop for conversion of a NUL-terminated BMP string */
    186         while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) {
    187             ++src;
    188             if(pDest < destLimit) {
    189                 *pDest++ = ch;
    190             } else {
    191                 ++reqLength;
    192             }
    193         }
    194         srcLimit = src;
    195         if(ch != 0) {
    196             /* "complicated" case, find the end of the remaining string */
    197             while(*++srcLimit != 0) {}
    198         }
    199     } else {
    200         srcLimit = src + srcLength;
    201     }
    202 
    203     /* convert with length */
    204     while(src < srcLimit) {
    205         ch = *src++;
    206         if(!U16_IS_SURROGATE(ch)) {
    207             /* write or count ch below */
    208         } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) {
    209             ++src;
    210             ch = U16_GET_SUPPLEMENTARY(ch, ch2);
    211         } else if((ch = subchar) < 0) {
    212             /* unpaired surrogate */
    213             *pErrorCode = U_INVALID_CHAR_FOUND;
    214             return NULL;
    215         } else {
    216             ++numSubstitutions;
    217         }
    218         if(pDest < destLimit) {
    219             *pDest++ = ch;
    220         } else {
    221             ++reqLength;
    222         }
    223     }
    224 
    225     reqLength += (int32_t)(pDest - dest);
    226     if(pDestLength) {
    227         *pDestLength = reqLength;
    228     }
    229     if(pNumSubstitutions != NULL) {
    230         *pNumSubstitutions = numSubstitutions;
    231     }
    232 
    233     /* Terminate the buffer */
    234     u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode);
    235 
    236     return dest;
    237 }
    238 
    239 U_CAPI UChar32* U_EXPORT2
    240 u_strToUTF32(UChar32 *dest,
    241              int32_t destCapacity,
    242              int32_t *pDestLength,
    243              const UChar *src,
    244              int32_t srcLength,
    245              UErrorCode *pErrorCode) {
    246     return u_strToUTF32WithSub(
    247             dest, destCapacity, pDestLength,
    248             src, srcLength,
    249             U_SENTINEL, NULL,
    250             pErrorCode);
    251 }
    252 
    253 /* for utf8_nextCharSafeBodyTerminated() */
    254 static const UChar32
    255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 };
    256 
    257 /*
    258  * Version of utf8_nextCharSafeBody() with the following differences:
    259  * - checks for NUL termination instead of length
    260  * - works with pointers instead of indexes
    261  * - always strict (strict==-1)
    262  *
    263  * *ps points to after the lead byte and will be moved to after the last trail byte.
    264  * c is the lead byte.
    265  * @return the code point, or U_SENTINEL
    266  */
    267 static UChar32
    268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) {
    269     const uint8_t *s=*ps;
    270     uint8_t trail, illegal=0;
    271     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
    272     UTF8_MASK_LEAD_BYTE((c), count);
    273     /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    274     switch(count) {
    275     /* each branch falls through to the next one */
    276     case 5:
    277     case 4:
    278         /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    279         illegal=1;
    280         break;
    281     case 3:
    282         trail=(uint8_t)(*s++ - 0x80);
    283         c=(c<<6)|trail;
    284         if(trail>0x3f || c>=0x110) {
    285             /* not a trail byte, or code point>0x10ffff (outside Unicode) */
    286             illegal=1;
    287             break;
    288         }
    289     case 2:
    290         trail=(uint8_t)(*s++ - 0x80);
    291         if(trail>0x3f) {
    292             /* not a trail byte */
    293             illegal=1;
    294             break;
    295         }
    296         c=(c<<6)|trail;
    297     case 1:
    298         trail=(uint8_t)(*s++ - 0x80);
    299         if(trail>0x3f) {
    300             /* not a trail byte */
    301             illegal=1;
    302         }
    303         c=(c<<6)|trail;
    304         break;
    305     case 0:
    306         return U_SENTINEL;
    307     /* no default branch to optimize switch()  - all values are covered */
    308     }
    309 
    310     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    311     /* illegal is also set if count>=4 */
    312     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
    313         /* error handling */
    314         /* don't go beyond this sequence */
    315         s=*ps;
    316         while(count>0 && UTF8_IS_TRAIL(*s)) {
    317             ++s;
    318             --count;
    319         }
    320         c=U_SENTINEL;
    321     }
    322     *ps=s;
    323     return c;
    324 }
    325 
    326 /*
    327  * Version of utf8_nextCharSafeBody() with the following differences:
    328  * - works with pointers instead of indexes
    329  * - always strict (strict==-1)
    330  *
    331  * *ps points to after the lead byte and will be moved to after the last trail byte.
    332  * c is the lead byte.
    333  * @return the code point, or U_SENTINEL
    334  */
    335 static UChar32
    336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) {
    337     const uint8_t *s=*ps;
    338     uint8_t trail, illegal=0;
    339     uint8_t count=UTF8_COUNT_TRAIL_BYTES(c);
    340     if((limit-s)>=count) {
    341         UTF8_MASK_LEAD_BYTE((c), count);
    342         /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */
    343         switch(count) {
    344         /* each branch falls through to the next one */
    345         case 5:
    346         case 4:
    347             /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
    348             illegal=1;
    349             break;
    350         case 3:
    351             trail=*s++;
    352             c=(c<<6)|(trail&0x3f);
    353             if(c<0x110) {
    354                 illegal|=(trail&0xc0)^0x80;
    355             } else {
    356                 /* code point>0x10ffff, outside Unicode */
    357                 illegal=1;
    358                 break;
    359             }
    360         case 2:
    361             trail=*s++;
    362             c=(c<<6)|(trail&0x3f);
    363             illegal|=(trail&0xc0)^0x80;
    364         case 1:
    365             trail=*s++;
    366             c=(c<<6)|(trail&0x3f);
    367             illegal|=(trail&0xc0)^0x80;
    368             break;
    369         case 0:
    370             return U_SENTINEL;
    371         /* no default branch to optimize switch()  - all values are covered */
    372         }
    373     } else {
    374         illegal=1; /* too few bytes left */
    375     }
    376 
    377     /* correct sequence - all trail bytes have (b7..b6)==(10)? */
    378     /* illegal is also set if count>=4 */
    379     if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) {
    380         /* error handling */
    381         /* don't go beyond this sequence */
    382         s=*ps;
    383         while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) {
    384             ++s;
    385             --count;
    386         }
    387         c=U_SENTINEL;
    388     }
    389     *ps=s;
    390     return c;
    391 }
    392 
    393 U_CAPI UChar* U_EXPORT2
    394 u_strFromUTF8WithSub(UChar *dest,
    395               int32_t destCapacity,
    396               int32_t *pDestLength,
    397               const char* src,
    398               int32_t srcLength,
    399               UChar32 subchar, int32_t *pNumSubstitutions,
    400               UErrorCode *pErrorCode){
    401     UChar *pDest = dest;
    402     UChar *pDestLimit = dest+destCapacity;
    403     UChar32 ch;
    404     int32_t reqLength = 0;
    405     const uint8_t* pSrc = (const uint8_t*) src;
    406     uint8_t t1, t2; /* trail bytes */
    407     int32_t numSubstitutions;
    408 
    409     /* args check */
    410     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    411         return NULL;
    412     }
    413 
    414     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    415         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    416         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    417     ) {
    418         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    419         return NULL;
    420     }
    421 
    422     if(pNumSubstitutions!=NULL) {
    423         *pNumSubstitutions=0;
    424     }
    425     numSubstitutions=0;
    426 
    427     /*
    428      * Inline processing of UTF-8 byte sequences:
    429      *
    430      * Byte sequences for the most common characters are handled inline in
    431      * the conversion loops. In order to reduce the path lengths for those
    432      * characters, the tests are arranged in a kind of binary search.
    433      * ASCII (<=0x7f) is checked first, followed by the dividing point
    434      * between 2- and 3-byte sequences (0xe0).
    435      * The 3-byte branch is tested first to speed up CJK text.
    436      * The compiler should combine the subtractions for the two tests for 0xe0.
    437      * Each branch then tests for the other end of its range.
    438      */
    439 
    440     if(srcLength < 0){
    441         /*
    442          * Transform a NUL-terminated string.
    443          * The code explicitly checks for NULs only in the lead byte position.
    444          * A NUL byte in the trail byte position fails the trail byte range check anyway.
    445          */
    446         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    447             if(ch <= 0x7f){
    448                 *pDest++=(UChar)ch;
    449                 ++pSrc;
    450             } else {
    451                 if(ch > 0xe0) {
    452                     if( /* handle U+1000..U+CFFF inline */
    453                         ch <= 0xec &&
    454                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    455                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    456                     ) {
    457                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    458                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    459                         pSrc += 3;
    460                         continue;
    461                     }
    462                 } else if(ch < 0xe0) {
    463                     if( /* handle U+0080..U+07FF inline */
    464                         ch >= 0xc2 &&
    465                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    466                     ) {
    467                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    468                         pSrc += 2;
    469                         continue;
    470                     }
    471                 }
    472 
    473                 /* function call for "complicated" and error cases */
    474                 ++pSrc; /* continue after the lead byte */
    475                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    476                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    477                     *pErrorCode = U_INVALID_CHAR_FOUND;
    478                     return NULL;
    479                 } else if(ch<=0xFFFF) {
    480                     *(pDest++)=(UChar)ch;
    481                 } else {
    482                     *(pDest++)=UTF16_LEAD(ch);
    483                     if(pDest<pDestLimit) {
    484                         *(pDest++)=UTF16_TRAIL(ch);
    485                     } else {
    486                         reqLength++;
    487                         break;
    488                     }
    489                 }
    490             }
    491         }
    492 
    493         /* Pre-flight the rest of the string. */
    494         while((ch = *pSrc) != 0) {
    495             if(ch <= 0x7f){
    496                 ++reqLength;
    497                 ++pSrc;
    498             } else {
    499                 if(ch > 0xe0) {
    500                     if( /* handle U+1000..U+CFFF inline */
    501                         ch <= 0xec &&
    502                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    503                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    504                     ) {
    505                         ++reqLength;
    506                         pSrc += 3;
    507                         continue;
    508                     }
    509                 } else if(ch < 0xe0) {
    510                     if( /* handle U+0080..U+07FF inline */
    511                         ch >= 0xc2 &&
    512                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    513                     ) {
    514                         ++reqLength;
    515                         pSrc += 2;
    516                         continue;
    517                     }
    518                 }
    519 
    520                 /* function call for "complicated" and error cases */
    521                 ++pSrc; /* continue after the lead byte */
    522                 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch);
    523                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) {
    524                     *pErrorCode = U_INVALID_CHAR_FOUND;
    525                     return NULL;
    526                 }
    527                 reqLength += U16_LENGTH(ch);
    528             }
    529         }
    530     } else /* srcLength >= 0 */ {
    531         const uint8_t *pSrcLimit = pSrc + srcLength;
    532         int32_t count;
    533 
    534         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
    535         for(;;) {
    536             /*
    537              * Each iteration of the inner loop progresses by at most 3 UTF-8
    538              * bytes and one UChar, for most characters.
    539              * For supplementary code points (4 & 2), which are rare,
    540              * there is an additional adjustment.
    541              */
    542             count = (int32_t)(pDestLimit - pDest);
    543             srcLength = (int32_t)((pSrcLimit - pSrc) / 3);
    544             if(count > srcLength) {
    545                 count = srcLength; /* min(remaining dest, remaining src/3) */
    546             }
    547             if(count < 3) {
    548                 /*
    549                  * Too much overhead if we get near the end of the string,
    550                  * continue with the next loop.
    551                  */
    552                 break;
    553             }
    554 
    555             do {
    556                 ch = *pSrc;
    557                 if(ch <= 0x7f){
    558                     *pDest++=(UChar)ch;
    559                     ++pSrc;
    560                 } else {
    561                     if(ch > 0xe0) {
    562                         if( /* handle U+1000..U+CFFF inline */
    563                             ch <= 0xec &&
    564                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    565                             (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    566                         ) {
    567                             /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    568                             *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    569                             pSrc += 3;
    570                             continue;
    571                         }
    572                     } else if(ch < 0xe0) {
    573                         if( /* handle U+0080..U+07FF inline */
    574                             ch >= 0xc2 &&
    575                             (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    576                         ) {
    577                             *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    578                             pSrc += 2;
    579                             continue;
    580                         }
    581                     }
    582 
    583                     if(ch >= 0xf0 || subchar > 0xffff) {
    584                         /*
    585                          * We may read up to six bytes and write up to two UChars,
    586                          * which we didn't account for with computing count,
    587                          * so we adjust it here.
    588                          */
    589                         if(--count == 0) {
    590                             break;
    591                         }
    592                     }
    593 
    594                     /* function call for "complicated" and error cases */
    595                     ++pSrc; /* continue after the lead byte */
    596                     ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    597                     if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    598                         *pErrorCode = U_INVALID_CHAR_FOUND;
    599                         return NULL;
    600                     }else if(ch<=0xFFFF){
    601                         *(pDest++)=(UChar)ch;
    602                     }else{
    603                         *(pDest++)=UTF16_LEAD(ch);
    604                         *(pDest++)=UTF16_TRAIL(ch);
    605                     }
    606                 }
    607             } while(--count > 0);
    608         }
    609 
    610         while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
    611             ch = *pSrc;
    612             if(ch <= 0x7f){
    613                 *pDest++=(UChar)ch;
    614                 ++pSrc;
    615             } else {
    616                 if(ch > 0xe0) {
    617                     if( /* handle U+1000..U+CFFF inline */
    618                         ch <= 0xec &&
    619                         ((pSrcLimit - pSrc) >= 3) &&
    620                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
    621                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
    622                     ) {
    623                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    624                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
    625                         pSrc += 3;
    626                         continue;
    627                     }
    628                 } else if(ch < 0xe0) {
    629                     if( /* handle U+0080..U+07FF inline */
    630                         ch >= 0xc2 &&
    631                         ((pSrcLimit - pSrc) >= 2) &&
    632                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
    633                     ) {
    634                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
    635                         pSrc += 2;
    636                         continue;
    637                     }
    638                 }
    639 
    640                 /* function call for "complicated" and error cases */
    641                 ++pSrc; /* continue after the lead byte */
    642                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    643                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    644                     *pErrorCode = U_INVALID_CHAR_FOUND;
    645                     return NULL;
    646                 }else if(ch<=0xFFFF){
    647                     *(pDest++)=(UChar)ch;
    648                 }else{
    649                     *(pDest++)=UTF16_LEAD(ch);
    650                     if(pDest<pDestLimit){
    651                         *(pDest++)=UTF16_TRAIL(ch);
    652                     }else{
    653                         reqLength++;
    654                         break;
    655                     }
    656                 }
    657             }
    658         }
    659         /* do not fill the dest buffer just count the UChars needed */
    660         while(pSrc < pSrcLimit){
    661             ch = *pSrc;
    662             if(ch <= 0x7f){
    663                 reqLength++;
    664                 ++pSrc;
    665             } else {
    666                 if(ch > 0xe0) {
    667                     if( /* handle U+1000..U+CFFF inline */
    668                         ch <= 0xec &&
    669                         ((pSrcLimit - pSrc) >= 3) &&
    670                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
    671                         (uint8_t)(pSrc[2] - 0x80) <= 0x3f
    672                     ) {
    673                         reqLength++;
    674                         pSrc += 3;
    675                         continue;
    676                     }
    677                 } else if(ch < 0xe0) {
    678                     if( /* handle U+0080..U+07FF inline */
    679                         ch >= 0xc2 &&
    680                         ((pSrcLimit - pSrc) >= 2) &&
    681                         (uint8_t)(pSrc[1] - 0x80) <= 0x3f
    682                     ) {
    683                         reqLength++;
    684                         pSrc += 2;
    685                         continue;
    686                     }
    687                 }
    688 
    689                 /* function call for "complicated" and error cases */
    690                 ++pSrc; /* continue after the lead byte */
    691                 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
    692                 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){
    693                     *pErrorCode = U_INVALID_CHAR_FOUND;
    694                     return NULL;
    695                 }
    696                 reqLength+=UTF_CHAR_LENGTH(ch);
    697             }
    698         }
    699     }
    700 
    701     reqLength+=(int32_t)(pDest - dest);
    702 
    703     if(pNumSubstitutions!=NULL) {
    704         *pNumSubstitutions=numSubstitutions;
    705     }
    706 
    707     if(pDestLength){
    708         *pDestLength = reqLength;
    709     }
    710 
    711     /* Terminate the buffer */
    712     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    713 
    714     return dest;
    715 }
    716 
    717 U_CAPI UChar* U_EXPORT2
    718 u_strFromUTF8(UChar *dest,
    719               int32_t destCapacity,
    720               int32_t *pDestLength,
    721               const char* src,
    722               int32_t srcLength,
    723               UErrorCode *pErrorCode){
    724     return u_strFromUTF8WithSub(
    725             dest, destCapacity, pDestLength,
    726             src, srcLength,
    727             U_SENTINEL, NULL,
    728             pErrorCode);
    729 }
    730 
    731 U_CAPI UChar * U_EXPORT2
    732 u_strFromUTF8Lenient(UChar *dest,
    733                      int32_t destCapacity,
    734                      int32_t *pDestLength,
    735                      const char *src,
    736                      int32_t srcLength,
    737                      UErrorCode *pErrorCode) {
    738     UChar *pDest = dest;
    739     UChar32 ch;
    740     int32_t reqLength = 0;
    741     uint8_t* pSrc = (uint8_t*) src;
    742 
    743     /* args check */
    744     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    745         return NULL;
    746     }
    747 
    748     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
    749         (destCapacity<0) || (dest == NULL && destCapacity > 0)
    750     ) {
    751         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    752         return NULL;
    753     }
    754 
    755     if(srcLength < 0) {
    756         /* Transform a NUL-terminated string. */
    757         UChar *pDestLimit = dest+destCapacity;
    758         uint8_t t1, t2, t3; /* trail bytes */
    759 
    760         while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) {
    761             if(ch < 0xc0) {
    762                 /*
    763                  * ASCII, or a trail byte in lead position which is treated like
    764                  * a single-byte sequence for better character boundary
    765                  * resynchronization after illegal sequences.
    766                  */
    767                 *pDest++=(UChar)ch;
    768                 ++pSrc;
    769                 continue;
    770             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    771                 if((t1 = pSrc[1]) != 0) {
    772                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    773                     *pDest++ = (UChar)((ch << 6) + t1 - 0x3080);
    774                     pSrc += 2;
    775                     continue;
    776                 }
    777             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    778                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) {
    779                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    780                     /* 0x2080 = (0x80 << 6) + 0x80 */
    781                     *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080);
    782                     pSrc += 3;
    783                     continue;
    784                 }
    785             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    786                 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) {
    787                     pSrc += 4;
    788                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    789                     ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080;
    790                     *(pDest++) = U16_LEAD(ch);
    791                     if(pDest < pDestLimit) {
    792                         *(pDest++) = U16_TRAIL(ch);
    793                     } else {
    794                         reqLength = 1;
    795                         break;
    796                     }
    797                     continue;
    798                 }
    799             }
    800 
    801             /* truncated character at the end */
    802             *pDest++ = 0xfffd;
    803             while(*++pSrc != 0) {}
    804             break;
    805         }
    806 
    807         /* Pre-flight the rest of the string. */
    808         while((ch = *pSrc) != 0) {
    809             if(ch < 0xc0) {
    810                 /*
    811                  * ASCII, or a trail byte in lead position which is treated like
    812                  * a single-byte sequence for better character boundary
    813                  * resynchronization after illegal sequences.
    814                  */
    815                 ++reqLength;
    816                 ++pSrc;
    817                 continue;
    818             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    819                 if(pSrc[1] != 0) {
    820                     ++reqLength;
    821                     pSrc += 2;
    822                     continue;
    823                 }
    824             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    825                 if(pSrc[1] != 0 && pSrc[2] != 0) {
    826                     ++reqLength;
    827                     pSrc += 3;
    828                     continue;
    829                 }
    830             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    831                 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) {
    832                     reqLength += 2;
    833                     pSrc += 4;
    834                     continue;
    835                 }
    836             }
    837 
    838             /* truncated character at the end */
    839             ++reqLength;
    840             break;
    841         }
    842     } else /* srcLength >= 0 */ {
    843         const uint8_t *pSrcLimit = pSrc + srcLength;
    844 
    845         /*
    846          * This function requires that if srcLength is given, then it must be
    847          * destCapatity >= srcLength so that we need not check for
    848          * destination buffer overflow in the loop.
    849          */
    850         if(destCapacity < srcLength) {
    851             if(pDestLength != NULL) {
    852                 *pDestLength = srcLength; /* this likely overestimates the true destLength! */
    853             }
    854             *pErrorCode = U_BUFFER_OVERFLOW_ERROR;
    855             return NULL;
    856         }
    857 
    858         if((pSrcLimit - pSrc) >= 4) {
    859             pSrcLimit -= 3; /* temporarily reduce pSrcLimit */
    860 
    861             /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */
    862             do {
    863                 ch = *pSrc++;
    864                 if(ch < 0xc0) {
    865                     /*
    866                      * ASCII, or a trail byte in lead position which is treated like
    867                      * a single-byte sequence for better character boundary
    868                      * resynchronization after illegal sequences.
    869                      */
    870                     *pDest++=(UChar)ch;
    871                 } else if(ch < 0xe0) { /* U+0080..U+07FF */
    872                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    873                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    874                 } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    875                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    876                     /* 0x2080 = (0x80 << 6) + 0x80 */
    877                     ch = (ch << 12) + (*pSrc++ << 6);
    878                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    879                 } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    880                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    881                     ch = (ch << 18) + (*pSrc++ << 12);
    882                     ch += *pSrc++ << 6;
    883                     ch += *pSrc++ - 0x3c82080;
    884                     *(pDest++) = U16_LEAD(ch);
    885                     *(pDest++) = U16_TRAIL(ch);
    886                 }
    887             } while(pSrc < pSrcLimit);
    888 
    889             pSrcLimit += 3; /* restore original pSrcLimit */
    890         }
    891 
    892         while(pSrc < pSrcLimit) {
    893             ch = *pSrc++;
    894             if(ch < 0xc0) {
    895                 /*
    896                  * ASCII, or a trail byte in lead position which is treated like
    897                  * a single-byte sequence for better character boundary
    898                  * resynchronization after illegal sequences.
    899                  */
    900                 *pDest++=(UChar)ch;
    901                 continue;
    902             } else if(ch < 0xe0) { /* U+0080..U+07FF */
    903                 if(pSrc < pSrcLimit) {
    904                     /* 0x3080 = (0xc0 << 6) + 0x80 */
    905                     *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080);
    906                     continue;
    907                 }
    908             } else if(ch < 0xf0) { /* U+0800..U+FFFF */
    909                 if((pSrcLimit - pSrc) >= 2) {
    910                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
    911                     /* 0x2080 = (0x80 << 6) + 0x80 */
    912                     ch = (ch << 12) + (*pSrc++ << 6);
    913                     *pDest++ = (UChar)(ch + *pSrc++ - 0x2080);
    914                     pSrc += 3;
    915                     continue;
    916                 }
    917             } else /* f0..f4 */ { /* U+10000..U+10FFFF */
    918                 if((pSrcLimit - pSrc) >= 3) {
    919                     /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    920                     ch = (ch << 18) + (*pSrc++ << 12);
    921                     ch += *pSrc++ << 6;
    922                     ch += *pSrc++ - 0x3c82080;
    923                     *(pDest++) = U16_LEAD(ch);
    924                     *(pDest++) = U16_TRAIL(ch);
    925                     pSrc += 4;
    926                     continue;
    927                 }
    928             }
    929 
    930             /* truncated character at the end */
    931             *pDest++ = 0xfffd;
    932             break;
    933         }
    934     }
    935 
    936     reqLength+=(int32_t)(pDest - dest);
    937 
    938     if(pDestLength){
    939         *pDestLength = reqLength;
    940     }
    941 
    942     /* Terminate the buffer */
    943     u_terminateUChars(dest,destCapacity,reqLength,pErrorCode);
    944 
    945     return dest;
    946 }
    947 
    948 static U_INLINE uint8_t *
    949 _appendUTF8(uint8_t *pDest, UChar32 c) {
    950     /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */
    951     if((c)<=0x7f) {
    952         *pDest++=(uint8_t)c;
    953     } else if(c<=0x7ff) {
    954         *pDest++=(uint8_t)((c>>6)|0xc0);
    955         *pDest++=(uint8_t)((c&0x3f)|0x80);
    956     } else if(c<=0xffff) {
    957         *pDest++=(uint8_t)((c>>12)|0xe0);
    958         *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80);
    959         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    960     } else /* if((uint32_t)(c)<=0x10ffff) */ {
    961         *pDest++=(uint8_t)(((c)>>18)|0xf0);
    962         *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80);
    963         *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80);
    964         *pDest++=(uint8_t)(((c)&0x3f)|0x80);
    965     }
    966     return pDest;
    967 }
    968 
    969 
    970 U_CAPI char* U_EXPORT2
    971 u_strToUTF8WithSub(char *dest,
    972             int32_t destCapacity,
    973             int32_t *pDestLength,
    974             const UChar *pSrc,
    975             int32_t srcLength,
    976             UChar32 subchar, int32_t *pNumSubstitutions,
    977             UErrorCode *pErrorCode){
    978     int32_t reqLength=0;
    979     uint32_t ch=0,ch2=0;
    980     uint8_t *pDest = (uint8_t *)dest;
    981     uint8_t *pDestLimit = pDest + destCapacity;
    982     int32_t numSubstitutions;
    983 
    984     /* args check */
    985     if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){
    986         return NULL;
    987     }
    988 
    989     if( (pSrc==NULL && srcLength!=0) || srcLength < -1 ||
    990         (destCapacity<0) || (dest == NULL && destCapacity > 0) ||
    991         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
    992     ) {
    993         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
    994         return NULL;
    995     }
    996 
    997     if(pNumSubstitutions!=NULL) {
    998         *pNumSubstitutions=0;
    999     }
   1000     numSubstitutions=0;
   1001 
   1002     if(srcLength==-1) {
   1003         while((ch=*pSrc)!=0) {
   1004             ++pSrc;
   1005             if(ch <= 0x7f) {
   1006                 if(pDest<pDestLimit) {
   1007                     *pDest++ = (uint8_t)ch;
   1008                 } else {
   1009                     reqLength = 1;
   1010                     break;
   1011                 }
   1012             } else if(ch <= 0x7ff) {
   1013                 if((pDestLimit - pDest) >= 2) {
   1014                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1015                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1016                 } else {
   1017                     reqLength = 2;
   1018                     break;
   1019                 }
   1020             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1021                 if((pDestLimit - pDest) >= 3) {
   1022                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1023                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1024                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1025                 } else {
   1026                     reqLength = 3;
   1027                     break;
   1028                 }
   1029             } else /* ch is a surrogate */ {
   1030                 int32_t length;
   1031 
   1032                 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/
   1033                 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
   1034                     ++pSrc;
   1035                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
   1036                 } else if(subchar>=0) {
   1037                     ch=subchar;
   1038                     ++numSubstitutions;
   1039                 } else {
   1040                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1041                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1042                     return NULL;
   1043                 }
   1044 
   1045                 length = U8_LENGTH(ch);
   1046                 if((pDestLimit - pDest) >= length) {
   1047                     /* convert and append*/
   1048                     pDest=_appendUTF8(pDest, ch);
   1049                 } else {
   1050                     reqLength = length;
   1051                     break;
   1052                 }
   1053             }
   1054         }
   1055         while((ch=*pSrc++)!=0) {
   1056             if(ch<=0x7f) {
   1057                 ++reqLength;
   1058             } else if(ch<=0x7ff) {
   1059                 reqLength+=2;
   1060             } else if(!UTF_IS_SURROGATE(ch)) {
   1061                 reqLength+=3;
   1062             } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
   1063                 ++pSrc;
   1064                 reqLength+=4;
   1065             } else if(subchar>=0) {
   1066                 reqLength+=U8_LENGTH(subchar);
   1067                 ++numSubstitutions;
   1068             } else {
   1069                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1070                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1071                 return NULL;
   1072             }
   1073         }
   1074     } else {
   1075         const UChar *pSrcLimit = pSrc+srcLength;
   1076         int32_t count;
   1077 
   1078         /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1079         for(;;) {
   1080             /*
   1081              * Each iteration of the inner loop progresses by at most 3 UTF-8
   1082              * bytes and one UChar, for most characters.
   1083              * For supplementary code points (4 & 2), which are rare,
   1084              * there is an additional adjustment.
   1085              */
   1086             count = (int32_t)((pDestLimit - pDest) / 3);
   1087             srcLength = (int32_t)(pSrcLimit - pSrc);
   1088             if(count > srcLength) {
   1089                 count = srcLength; /* min(remaining dest/3, remaining src) */
   1090             }
   1091             if(count < 3) {
   1092                 /*
   1093                  * Too much overhead if we get near the end of the string,
   1094                  * continue with the next loop.
   1095                  */
   1096                 break;
   1097             }
   1098             do {
   1099                 ch=*pSrc++;
   1100                 if(ch <= 0x7f) {
   1101                     *pDest++ = (uint8_t)ch;
   1102                 } else if(ch <= 0x7ff) {
   1103                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1104                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1105                 } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1106                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1107                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1108                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1109                 } else /* ch is a surrogate */ {
   1110                     /*
   1111                      * We will read two UChars and probably output four bytes,
   1112                      * which we didn't account for with computing count,
   1113                      * so we adjust it here.
   1114                      */
   1115                     if(--count == 0) {
   1116                         --pSrc; /* undo ch=*pSrc++ for the lead surrogate */
   1117                         break;  /* recompute count */
   1118                     }
   1119 
   1120                     if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) {
   1121                         ++pSrc;
   1122                         ch=UTF16_GET_PAIR_VALUE(ch, ch2);
   1123 
   1124                         /* writing 4 bytes per 2 UChars is ok */
   1125                         *pDest++=(uint8_t)((ch>>18)|0xf0);
   1126                         *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80);
   1127                         *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1128                         *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1129                     } else  {
   1130                         /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1131                         if(subchar>=0) {
   1132                             ch=subchar;
   1133                             ++numSubstitutions;
   1134                         } else {
   1135                             *pErrorCode = U_INVALID_CHAR_FOUND;
   1136                             return NULL;
   1137                         }
   1138 
   1139                         /* convert and append*/
   1140                         pDest=_appendUTF8(pDest, ch);
   1141                     }
   1142                 }
   1143             } while(--count > 0);
   1144         }
   1145 
   1146         while(pSrc<pSrcLimit) {
   1147             ch=*pSrc++;
   1148             if(ch <= 0x7f) {
   1149                 if(pDest<pDestLimit) {
   1150                     *pDest++ = (uint8_t)ch;
   1151                 } else {
   1152                     reqLength = 1;
   1153                     break;
   1154                 }
   1155             } else if(ch <= 0x7ff) {
   1156                 if((pDestLimit - pDest) >= 2) {
   1157                     *pDest++=(uint8_t)((ch>>6)|0xc0);
   1158                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1159                 } else {
   1160                     reqLength = 2;
   1161                     break;
   1162                 }
   1163             } else if(ch <= 0xd7ff || ch >= 0xe000) {
   1164                 if((pDestLimit - pDest) >= 3) {
   1165                     *pDest++=(uint8_t)((ch>>12)|0xe0);
   1166                     *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1167                     *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1168                 } else {
   1169                     reqLength = 3;
   1170                     break;
   1171                 }
   1172             } else /* ch is a surrogate */ {
   1173                 int32_t length;
   1174 
   1175                 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
   1176                     ++pSrc;
   1177                     ch=UTF16_GET_PAIR_VALUE(ch, ch2);
   1178                 } else if(subchar>=0) {
   1179                     ch=subchar;
   1180                     ++numSubstitutions;
   1181                 } else {
   1182                     /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1183                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1184                     return NULL;
   1185                 }
   1186 
   1187                 length = U8_LENGTH(ch);
   1188                 if((pDestLimit - pDest) >= length) {
   1189                     /* convert and append*/
   1190                     pDest=_appendUTF8(pDest, ch);
   1191                 } else {
   1192                     reqLength = length;
   1193                     break;
   1194                 }
   1195             }
   1196         }
   1197         while(pSrc<pSrcLimit) {
   1198             ch=*pSrc++;
   1199             if(ch<=0x7f) {
   1200                 ++reqLength;
   1201             } else if(ch<=0x7ff) {
   1202                 reqLength+=2;
   1203             } else if(!UTF_IS_SURROGATE(ch)) {
   1204                 reqLength+=3;
   1205             } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) {
   1206                 ++pSrc;
   1207                 reqLength+=4;
   1208             } else if(subchar>=0) {
   1209                 reqLength+=U8_LENGTH(subchar);
   1210                 ++numSubstitutions;
   1211             } else {
   1212                 /* Unicode 3.2 forbids surrogate code points in UTF-8 */
   1213                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1214                 return NULL;
   1215             }
   1216         }
   1217     }
   1218 
   1219     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1220 
   1221     if(pNumSubstitutions!=NULL) {
   1222         *pNumSubstitutions=numSubstitutions;
   1223     }
   1224 
   1225     if(pDestLength){
   1226         *pDestLength = reqLength;
   1227     }
   1228 
   1229     /* Terminate the buffer */
   1230     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1231     return dest;
   1232 }
   1233 
   1234 U_CAPI char* U_EXPORT2
   1235 u_strToUTF8(char *dest,
   1236             int32_t destCapacity,
   1237             int32_t *pDestLength,
   1238             const UChar *pSrc,
   1239             int32_t srcLength,
   1240             UErrorCode *pErrorCode){
   1241     return u_strToUTF8WithSub(
   1242             dest, destCapacity, pDestLength,
   1243             pSrc, srcLength,
   1244             U_SENTINEL, NULL,
   1245             pErrorCode);
   1246 }
   1247 
   1248 U_CAPI UChar* U_EXPORT2
   1249 u_strFromJavaModifiedUTF8WithSub(
   1250         UChar *dest,
   1251         int32_t destCapacity,
   1252         int32_t *pDestLength,
   1253         const char *src,
   1254         int32_t srcLength,
   1255         UChar32 subchar, int32_t *pNumSubstitutions,
   1256         UErrorCode *pErrorCode) {
   1257     UChar *pDest = dest;
   1258     UChar *pDestLimit = dest+destCapacity;
   1259     UChar32 ch;
   1260     int32_t reqLength = 0;
   1261     const uint8_t* pSrc = (const uint8_t*) src;
   1262     const uint8_t *pSrcLimit;
   1263     int32_t count;
   1264     uint8_t t1, t2; /* trail bytes */
   1265     int32_t numSubstitutions;
   1266 
   1267     /* args check */
   1268     if(U_FAILURE(*pErrorCode)){
   1269         return NULL;
   1270     }
   1271     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1272         (dest==NULL && destCapacity!=0) || destCapacity<0 ||
   1273         subchar > 0x10ffff || U_IS_SURROGATE(subchar)
   1274     ) {
   1275         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1276         return NULL;
   1277     }
   1278 
   1279     if(pNumSubstitutions!=NULL) {
   1280         *pNumSubstitutions=0;
   1281     }
   1282     numSubstitutions=0;
   1283 
   1284     if(srcLength < 0) {
   1285         /*
   1286          * Transform a NUL-terminated ASCII string.
   1287          * Handle non-ASCII strings with slower code.
   1288          */
   1289         while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) {
   1290             *pDest++=(UChar)ch;
   1291             ++pSrc;
   1292         }
   1293         if(ch == 0) {
   1294             reqLength=(int32_t)(pDest - dest);
   1295             if(pDestLength) {
   1296                 *pDestLength = reqLength;
   1297             }
   1298 
   1299             /* Terminate the buffer */
   1300             u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1301             return dest;
   1302         }
   1303         srcLength = uprv_strlen((const char *)pSrc);
   1304     }
   1305 
   1306     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1307     pSrcLimit = pSrc + srcLength;
   1308     for(;;) {
   1309         count = (int32_t)(pDestLimit - pDest);
   1310         srcLength = (int32_t)(pSrcLimit - pSrc);
   1311         if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) {
   1312             /* fast ASCII loop */
   1313             const uint8_t *prevSrc = pSrc;
   1314             int32_t delta;
   1315             while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) {
   1316                 *pDest++=(UChar)ch;
   1317                 ++pSrc;
   1318             }
   1319             delta = (int32_t)(pSrc - prevSrc);
   1320             count -= delta;
   1321             srcLength -= delta;
   1322         }
   1323         /*
   1324          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1325          * bytes and one UChar.
   1326          */
   1327         srcLength /= 3;
   1328         if(count > srcLength) {
   1329             count = srcLength; /* min(remaining dest, remaining src/3) */
   1330         }
   1331         if(count < 3) {
   1332             /*
   1333              * Too much overhead if we get near the end of the string,
   1334              * continue with the next loop.
   1335              */
   1336             break;
   1337         }
   1338         do {
   1339             ch = *pSrc;
   1340             if(ch <= 0x7f){
   1341                 *pDest++=(UChar)ch;
   1342                 ++pSrc;
   1343             } else {
   1344                 if(ch >= 0xe0) {
   1345                     if( /* handle U+0000..U+FFFF inline */
   1346                         ch <= 0xef &&
   1347                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1348                         (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1349                     ) {
   1350                         /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1351                         *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1352                         pSrc += 3;
   1353                         continue;
   1354                     }
   1355                 } else {
   1356                     if( /* handle U+0000..U+07FF inline */
   1357                         ch >= 0xc0 &&
   1358                         (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1359                     ) {
   1360                         *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1361                         pSrc += 2;
   1362                         continue;
   1363                     }
   1364                 }
   1365 
   1366                 if(subchar < 0) {
   1367                     *pErrorCode = U_INVALID_CHAR_FOUND;
   1368                     return NULL;
   1369                 } else if(subchar > 0xffff && --count == 0) {
   1370                     /*
   1371                      * We need to write two UChars, adjusted count for that,
   1372                      * and ran out of space.
   1373                      */
   1374                     break;
   1375                 } else {
   1376                     /* function call for error cases */
   1377                     ++pSrc; /* continue after the lead byte */
   1378                     utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1379                     ++numSubstitutions;
   1380                     if(subchar<=0xFFFF) {
   1381                         *(pDest++)=(UChar)subchar;
   1382                     } else {
   1383                         *(pDest++)=U16_LEAD(subchar);
   1384                         *(pDest++)=U16_TRAIL(subchar);
   1385                     }
   1386                 }
   1387             }
   1388         } while(--count > 0);
   1389     }
   1390 
   1391     while((pSrc<pSrcLimit) && (pDest<pDestLimit)) {
   1392         ch = *pSrc;
   1393         if(ch <= 0x7f){
   1394             *pDest++=(UChar)ch;
   1395             ++pSrc;
   1396         } else {
   1397             if(ch >= 0xe0) {
   1398                 if( /* handle U+0000..U+FFFF inline */
   1399                     ch <= 0xef &&
   1400                     ((pSrcLimit - pSrc) >= 3) &&
   1401                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f &&
   1402                     (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f
   1403                 ) {
   1404                     /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */
   1405                     *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2);
   1406                     pSrc += 3;
   1407                     continue;
   1408                 }
   1409             } else {
   1410                 if( /* handle U+0000..U+07FF inline */
   1411                     ch >= 0xc0 &&
   1412                     ((pSrcLimit - pSrc) >= 2) &&
   1413                     (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f
   1414                 ) {
   1415                     *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1);
   1416                     pSrc += 2;
   1417                     continue;
   1418                 }
   1419             }
   1420 
   1421             if(subchar < 0) {
   1422                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1423                 return NULL;
   1424             } else {
   1425                 /* function call for error cases */
   1426                 ++pSrc; /* continue after the lead byte */
   1427                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1428                 ++numSubstitutions;
   1429                 if(subchar<=0xFFFF) {
   1430                     *(pDest++)=(UChar)subchar;
   1431                 } else {
   1432                     *(pDest++)=U16_LEAD(subchar);
   1433                     if(pDest<pDestLimit) {
   1434                         *(pDest++)=U16_TRAIL(subchar);
   1435                     } else {
   1436                         reqLength++;
   1437                         break;
   1438                     }
   1439                 }
   1440             }
   1441         }
   1442     }
   1443 
   1444     /* do not fill the dest buffer just count the UChars needed */
   1445     while(pSrc < pSrcLimit){
   1446         ch = *pSrc;
   1447         if(ch <= 0x7f) {
   1448             reqLength++;
   1449             ++pSrc;
   1450         } else {
   1451             if(ch >= 0xe0) {
   1452                 if( /* handle U+0000..U+FFFF inline */
   1453                     ch <= 0xef &&
   1454                     ((pSrcLimit - pSrc) >= 3) &&
   1455                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f &&
   1456                     (uint8_t)(pSrc[2] - 0x80) <= 0x3f
   1457                 ) {
   1458                     reqLength++;
   1459                     pSrc += 3;
   1460                     continue;
   1461                 }
   1462             } else {
   1463                 if( /* handle U+0000..U+07FF inline */
   1464                     ch >= 0xc0 &&
   1465                     ((pSrcLimit - pSrc) >= 2) &&
   1466                     (uint8_t)(pSrc[1] - 0x80) <= 0x3f
   1467                 ) {
   1468                     reqLength++;
   1469                     pSrc += 2;
   1470                     continue;
   1471                 }
   1472             }
   1473 
   1474             if(subchar < 0) {
   1475                 *pErrorCode = U_INVALID_CHAR_FOUND;
   1476                 return NULL;
   1477             } else {
   1478                 /* function call for error cases */
   1479                 ++pSrc; /* continue after the lead byte */
   1480                 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch);
   1481                 ++numSubstitutions;
   1482                 reqLength+=U16_LENGTH(ch);
   1483             }
   1484         }
   1485     }
   1486 
   1487     if(pNumSubstitutions!=NULL) {
   1488         *pNumSubstitutions=numSubstitutions;
   1489     }
   1490 
   1491     reqLength+=(int32_t)(pDest - dest);
   1492     if(pDestLength) {
   1493         *pDestLength = reqLength;
   1494     }
   1495 
   1496     /* Terminate the buffer */
   1497     u_terminateUChars(dest, destCapacity, reqLength, pErrorCode);
   1498     return dest;
   1499 }
   1500 
   1501 U_CAPI char* U_EXPORT2
   1502 u_strToJavaModifiedUTF8(
   1503         char *dest,
   1504         int32_t destCapacity,
   1505         int32_t *pDestLength,
   1506         const UChar *src,
   1507         int32_t srcLength,
   1508         UErrorCode *pErrorCode) {
   1509     int32_t reqLength=0;
   1510     uint32_t ch=0;
   1511     uint8_t *pDest = (uint8_t *)dest;
   1512     uint8_t *pDestLimit = pDest + destCapacity;
   1513     const UChar *pSrcLimit;
   1514     int32_t count;
   1515 
   1516     /* args check */
   1517     if(U_FAILURE(*pErrorCode)){
   1518         return NULL;
   1519     }
   1520     if( (src==NULL && srcLength!=0) || srcLength < -1 ||
   1521         (dest==NULL && destCapacity!=0) || destCapacity<0
   1522     ) {
   1523         *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
   1524         return NULL;
   1525     }
   1526 
   1527     if(srcLength==-1) {
   1528         /* Convert NUL-terminated ASCII, then find the string length. */
   1529         while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) {
   1530             *pDest++ = (uint8_t)ch;
   1531             ++src;
   1532         }
   1533         if(ch == 0) {
   1534             reqLength=(int32_t)(pDest - (uint8_t *)dest);
   1535             if(pDestLength) {
   1536                 *pDestLength = reqLength;
   1537             }
   1538 
   1539             /* Terminate the buffer */
   1540             u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1541             return dest;
   1542         }
   1543         srcLength = u_strlen(src);
   1544     }
   1545 
   1546     /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */
   1547     pSrcLimit = src+srcLength;
   1548     for(;;) {
   1549         count = (int32_t)(pDestLimit - pDest);
   1550         srcLength = (int32_t)(pSrcLimit - src);
   1551         if(count >= srcLength && srcLength > 0 && *src <= 0x7f) {
   1552             /* fast ASCII loop */
   1553             const UChar *prevSrc = src;
   1554             int32_t delta;
   1555             while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) {
   1556                 *pDest++=(uint8_t)ch;
   1557                 ++src;
   1558             }
   1559             delta = (int32_t)(src - prevSrc);
   1560             count -= delta;
   1561             srcLength -= delta;
   1562         }
   1563         /*
   1564          * Each iteration of the inner loop progresses by at most 3 UTF-8
   1565          * bytes and one UChar.
   1566          */
   1567         count /= 3;
   1568         if(count > srcLength) {
   1569             count = srcLength; /* min(remaining dest/3, remaining src) */
   1570         }
   1571         if(count < 3) {
   1572             /*
   1573              * Too much overhead if we get near the end of the string,
   1574              * continue with the next loop.
   1575              */
   1576             break;
   1577         }
   1578         do {
   1579             ch=*src++;
   1580             if(ch <= 0x7f && ch != 0) {
   1581                 *pDest++ = (uint8_t)ch;
   1582             } else if(ch <= 0x7ff) {
   1583                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1584                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1585             } else {
   1586                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1587                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1588                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1589             }
   1590         } while(--count > 0);
   1591     }
   1592 
   1593     while(src<pSrcLimit) {
   1594         ch=*src++;
   1595         if(ch <= 0x7f && ch != 0) {
   1596             if(pDest<pDestLimit) {
   1597                 *pDest++ = (uint8_t)ch;
   1598             } else {
   1599                 reqLength = 1;
   1600                 break;
   1601             }
   1602         } else if(ch <= 0x7ff) {
   1603             if((pDestLimit - pDest) >= 2) {
   1604                 *pDest++=(uint8_t)((ch>>6)|0xc0);
   1605                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1606             } else {
   1607                 reqLength = 2;
   1608                 break;
   1609             }
   1610         } else {
   1611             if((pDestLimit - pDest) >= 3) {
   1612                 *pDest++=(uint8_t)((ch>>12)|0xe0);
   1613                 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80);
   1614                 *pDest++=(uint8_t)((ch&0x3f)|0x80);
   1615             } else {
   1616                 reqLength = 3;
   1617                 break;
   1618             }
   1619         }
   1620     }
   1621     while(src<pSrcLimit) {
   1622         ch=*src++;
   1623         if(ch <= 0x7f && ch != 0) {
   1624             ++reqLength;
   1625         } else if(ch<=0x7ff) {
   1626             reqLength+=2;
   1627         } else {
   1628             reqLength+=3;
   1629         }
   1630     }
   1631 
   1632     reqLength+=(int32_t)(pDest - (uint8_t *)dest);
   1633     if(pDestLength){
   1634         *pDestLength = reqLength;
   1635     }
   1636 
   1637     /* Terminate the buffer */
   1638     u_terminateChars(dest, destCapacity, reqLength, pErrorCode);
   1639     return dest;
   1640 }
   1641