Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2002-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   file name:  ucnv_u16.c
      9 *   encoding:   UTF-8
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002jul01
     14 *   created by: Markus W. Scherer
     15 *
     16 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
     17 */
     18 
     19 #include "unicode/utypes.h"
     20 
     21 #if !UCONFIG_NO_CONVERSION
     22 
     23 #include "unicode/ucnv.h"
     24 #include "unicode/uversion.h"
     25 #include "ucnv_bld.h"
     26 #include "ucnv_cnv.h"
     27 #include "cmemory.h"
     28 
     29 enum {
     30     UCNV_NEED_TO_WRITE_BOM=1
     31 };
     32 
     33 U_CDECL_BEGIN
     34 /*
     35  * The UTF-16 toUnicode implementation is also used for the Java-specific
     36  * "with BOM" variants of UTF-16BE and UTF-16LE.
     37  */
     38 static void  U_CALLCONV
     39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
     40                            UErrorCode *pErrorCode);
     41 
     42 /* UTF-16BE ----------------------------------------------------------------- */
     43 
     44 #if U_IS_BIG_ENDIAN
     45 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
     46 #else
     47 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
     48 #endif
     49 
     50 
     51 static void  U_CALLCONV
     52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
     53                                UErrorCode *pErrorCode) {
     54     UConverter *cnv;
     55     const UChar *source;
     56     char *target;
     57     int32_t *offsets;
     58 
     59     uint32_t targetCapacity, length, sourceIndex;
     60     UChar c, trail;
     61     char overflow[4];
     62 
     63     source=pArgs->source;
     64     length=(int32_t)(pArgs->sourceLimit-source);
     65     if(length<=0) {
     66         /* no input, nothing to do */
     67         return;
     68     }
     69 
     70     cnv=pArgs->converter;
     71 
     72     /* write the BOM if necessary */
     73     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
     74         static const char bom[]={ (char)0xfe, (char)0xff };
     75         ucnv_fromUWriteBytes(cnv,
     76                              bom, 2,
     77                              &pArgs->target, pArgs->targetLimit,
     78                              &pArgs->offsets, -1,
     79                              pErrorCode);
     80         cnv->fromUnicodeStatus=0;
     81     }
     82 
     83     target=pArgs->target;
     84     if(target >= pArgs->targetLimit) {
     85         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     86         return;
     87     }
     88 
     89     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
     90     offsets=pArgs->offsets;
     91     sourceIndex=0;
     92 
     93     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
     94 
     95     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
     96         /* the last buffer ended with a lead surrogate, output the surrogate pair */
     97         ++source;
     98         --length;
     99         target[0]=(uint8_t)(c>>8);
    100         target[1]=(uint8_t)c;
    101         target[2]=(uint8_t)(trail>>8);
    102         target[3]=(uint8_t)trail;
    103         target+=4;
    104         targetCapacity-=4;
    105         if(offsets!=NULL) {
    106             *offsets++=-1;
    107             *offsets++=-1;
    108             *offsets++=-1;
    109             *offsets++=-1;
    110         }
    111         sourceIndex=1;
    112         cnv->fromUChar32=c=0;
    113     }
    114 
    115     if(c==0) {
    116         /* copy an even number of bytes for complete UChars */
    117         uint32_t count=2*length;
    118         if(count>targetCapacity) {
    119             count=targetCapacity&~1;
    120         }
    121         /* count is even */
    122         targetCapacity-=count;
    123         count>>=1;
    124         length-=count;
    125 
    126         if(offsets==NULL) {
    127             while(count>0) {
    128                 c=*source++;
    129                 if(U16_IS_SINGLE(c)) {
    130                     target[0]=(uint8_t)(c>>8);
    131                     target[1]=(uint8_t)c;
    132                     target+=2;
    133                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    134                     ++source;
    135                     --count;
    136                     target[0]=(uint8_t)(c>>8);
    137                     target[1]=(uint8_t)c;
    138                     target[2]=(uint8_t)(trail>>8);
    139                     target[3]=(uint8_t)trail;
    140                     target+=4;
    141                 } else {
    142                     break;
    143                 }
    144                 --count;
    145             }
    146         } else {
    147             while(count>0) {
    148                 c=*source++;
    149                 if(U16_IS_SINGLE(c)) {
    150                     target[0]=(uint8_t)(c>>8);
    151                     target[1]=(uint8_t)c;
    152                     target+=2;
    153                     *offsets++=sourceIndex;
    154                     *offsets++=sourceIndex++;
    155                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    156                     ++source;
    157                     --count;
    158                     target[0]=(uint8_t)(c>>8);
    159                     target[1]=(uint8_t)c;
    160                     target[2]=(uint8_t)(trail>>8);
    161                     target[3]=(uint8_t)trail;
    162                     target+=4;
    163                     *offsets++=sourceIndex;
    164                     *offsets++=sourceIndex;
    165                     *offsets++=sourceIndex;
    166                     *offsets++=sourceIndex;
    167                     sourceIndex+=2;
    168                 } else {
    169                     break;
    170                 }
    171                 --count;
    172             }
    173         }
    174 
    175         if(count==0) {
    176             /* done with the loop for complete UChars */
    177             if(length>0 && targetCapacity>0) {
    178                 /*
    179                  * there is more input and some target capacity -
    180                  * it must be targetCapacity==1 because otherwise
    181                  * the above would have copied more;
    182                  * prepare for overflow output
    183                  */
    184                 if(U16_IS_SINGLE(c=*source++)) {
    185                     overflow[0]=(char)(c>>8);
    186                     overflow[1]=(char)c;
    187                     length=2; /* 2 bytes to output */
    188                     c=0;
    189                 /* } else { keep c for surrogate handling, length will be set there */
    190                 }
    191             } else {
    192                 length=0;
    193                 c=0;
    194             }
    195         } else {
    196             /* keep c for surrogate handling, length will be set there */
    197             targetCapacity+=2*count;
    198         }
    199     } else {
    200         length=0; /* from here on, length counts the bytes in overflow[] */
    201     }
    202 
    203     if(c!=0) {
    204         /*
    205          * c is a surrogate, and
    206          * - source or target too short
    207          * - or the surrogate is unmatched
    208          */
    209         length=0;
    210         if(U16_IS_SURROGATE_LEAD(c)) {
    211             if(source<pArgs->sourceLimit) {
    212                 if(U16_IS_TRAIL(trail=*source)) {
    213                     /* output the surrogate pair, will overflow (see conditions comment above) */
    214                     ++source;
    215                     overflow[0]=(char)(c>>8);
    216                     overflow[1]=(char)c;
    217                     overflow[2]=(char)(trail>>8);
    218                     overflow[3]=(char)trail;
    219                     length=4; /* 4 bytes to output */
    220                     c=0;
    221                 } else {
    222                     /* unmatched lead surrogate */
    223                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    224                 }
    225             } else {
    226                 /* see if the trail surrogate is in the next buffer */
    227             }
    228         } else {
    229             /* unmatched trail surrogate */
    230             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    231         }
    232         cnv->fromUChar32=c;
    233     }
    234 
    235     if(length>0) {
    236         /* output length bytes with overflow (length>targetCapacity>0) */
    237         ucnv_fromUWriteBytes(cnv,
    238                              overflow, length,
    239                              (char **)&target, pArgs->targetLimit,
    240                              &offsets, sourceIndex,
    241                              pErrorCode);
    242         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    243     }
    244 
    245     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    246         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    247     }
    248 
    249     /* write back the updated pointers */
    250     pArgs->source=source;
    251     pArgs->target=(char *)target;
    252     pArgs->offsets=offsets;
    253 }
    254 
    255 static void  U_CALLCONV
    256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    257                              UErrorCode *pErrorCode) {
    258     UConverter *cnv;
    259     const uint8_t *source;
    260     UChar *target;
    261     int32_t *offsets;
    262 
    263     uint32_t targetCapacity, length, count, sourceIndex;
    264     UChar c, trail;
    265 
    266     if(pArgs->converter->mode<8) {
    267         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    268         return;
    269     }
    270 
    271     cnv=pArgs->converter;
    272     source=(const uint8_t *)pArgs->source;
    273     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    274     if(length<=0 && cnv->toUnicodeStatus==0) {
    275         /* no input, nothing to do */
    276         return;
    277     }
    278 
    279     target=pArgs->target;
    280     if(target >= pArgs->targetLimit) {
    281         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    282         return;
    283     }
    284 
    285     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    286     offsets=pArgs->offsets;
    287     sourceIndex=0;
    288     c=0;
    289 
    290     /* complete a partial UChar or pair from the last call */
    291     if(cnv->toUnicodeStatus!=0) {
    292         /*
    293          * special case: single byte from a previous buffer,
    294          * where the byte turned out not to belong to a trail surrogate
    295          * and the preceding, unmatched lead surrogate was put into toUBytes[]
    296          * for error handling
    297          */
    298         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    299         cnv->toULength=1;
    300         cnv->toUnicodeStatus=0;
    301     }
    302     if((count=cnv->toULength)!=0) {
    303         uint8_t *p=cnv->toUBytes;
    304         do {
    305             p[count++]=*source++;
    306             ++sourceIndex;
    307             --length;
    308             if(count==2) {
    309                 c=((UChar)p[0]<<8)|p[1];
    310                 if(U16_IS_SINGLE(c)) {
    311                     /* output the BMP code point */
    312                     *target++=c;
    313                     if(offsets!=NULL) {
    314                         *offsets++=-1;
    315                     }
    316                     --targetCapacity;
    317                     count=0;
    318                     c=0;
    319                     break;
    320                 } else if(U16_IS_SURROGATE_LEAD(c)) {
    321                     /* continue collecting bytes for the trail surrogate */
    322                     c=0; /* avoid unnecessary surrogate handling below */
    323                 } else {
    324                     /* fall through to error handling for an unmatched trail surrogate */
    325                     break;
    326                 }
    327             } else if(count==4) {
    328                 c=((UChar)p[0]<<8)|p[1];
    329                 trail=((UChar)p[2]<<8)|p[3];
    330                 if(U16_IS_TRAIL(trail)) {
    331                     /* output the surrogate pair */
    332                     *target++=c;
    333                     if(targetCapacity>=2) {
    334                         *target++=trail;
    335                         if(offsets!=NULL) {
    336                             *offsets++=-1;
    337                             *offsets++=-1;
    338                         }
    339                         targetCapacity-=2;
    340                     } else /* targetCapacity==1 */ {
    341                         targetCapacity=0;
    342                         cnv->UCharErrorBuffer[0]=trail;
    343                         cnv->UCharErrorBufferLength=1;
    344                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    345                     }
    346                     count=0;
    347                     c=0;
    348                     break;
    349                 } else {
    350                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    351                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    352 
    353                     /* back out reading the code unit after it */
    354                     if(((const uint8_t *)pArgs->source-source)>=2) {
    355                         source-=2;
    356                     } else {
    357                         /*
    358                          * if the trail unit's first byte was in a previous buffer, then
    359                          * we need to put it into a special place because toUBytes[] will be
    360                          * used for the lead unit's bytes
    361                          */
    362                         cnv->toUnicodeStatus=0x100|p[2];
    363                         --source;
    364                     }
    365                     cnv->toULength=2;
    366 
    367                     /* write back the updated pointers */
    368                     pArgs->source=(const char *)source;
    369                     pArgs->target=target;
    370                     pArgs->offsets=offsets;
    371                     return;
    372                 }
    373             }
    374         } while(length>0);
    375         cnv->toULength=(int8_t)count;
    376     }
    377 
    378     /* copy an even number of bytes for complete UChars */
    379     count=2*targetCapacity;
    380     if(count>length) {
    381         count=length&~1;
    382     }
    383     if(c==0 && count>0) {
    384         length-=count;
    385         count>>=1;
    386         targetCapacity-=count;
    387         if(offsets==NULL) {
    388             do {
    389                 c=((UChar)source[0]<<8)|source[1];
    390                 source+=2;
    391                 if(U16_IS_SINGLE(c)) {
    392                     *target++=c;
    393                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    394                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
    395                 ) {
    396                     source+=2;
    397                     --count;
    398                     *target++=c;
    399                     *target++=trail;
    400                 } else {
    401                     break;
    402                 }
    403             } while(--count>0);
    404         } else {
    405             do {
    406                 c=((UChar)source[0]<<8)|source[1];
    407                 source+=2;
    408                 if(U16_IS_SINGLE(c)) {
    409                     *target++=c;
    410                     *offsets++=sourceIndex;
    411                     sourceIndex+=2;
    412                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    413                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
    414                 ) {
    415                     source+=2;
    416                     --count;
    417                     *target++=c;
    418                     *target++=trail;
    419                     *offsets++=sourceIndex;
    420                     *offsets++=sourceIndex;
    421                     sourceIndex+=4;
    422                 } else {
    423                     break;
    424                 }
    425             } while(--count>0);
    426         }
    427 
    428         if(count==0) {
    429             /* done with the loop for complete UChars */
    430             c=0;
    431         } else {
    432             /* keep c for surrogate handling, trail will be set there */
    433             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
    434             targetCapacity+=count;
    435         }
    436     }
    437 
    438     if(c!=0) {
    439         /*
    440          * c is a surrogate, and
    441          * - source or target too short
    442          * - or the surrogate is unmatched
    443          */
    444         cnv->toUBytes[0]=(uint8_t)(c>>8);
    445         cnv->toUBytes[1]=(uint8_t)c;
    446         cnv->toULength=2;
    447 
    448         if(U16_IS_SURROGATE_LEAD(c)) {
    449             if(length>=2) {
    450                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
    451                     /* output the surrogate pair, will overflow (see conditions comment above) */
    452                     source+=2;
    453                     length-=2;
    454                     *target++=c;
    455                     if(offsets!=NULL) {
    456                         *offsets++=sourceIndex;
    457                     }
    458                     cnv->UCharErrorBuffer[0]=trail;
    459                     cnv->UCharErrorBufferLength=1;
    460                     cnv->toULength=0;
    461                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    462                 } else {
    463                     /* unmatched lead surrogate */
    464                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    465                 }
    466             } else {
    467                 /* see if the trail surrogate is in the next buffer */
    468             }
    469         } else {
    470             /* unmatched trail surrogate */
    471             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    472         }
    473     }
    474 
    475     if(U_SUCCESS(*pErrorCode)) {
    476         /* check for a remaining source byte */
    477         if(length>0) {
    478             if(targetCapacity==0) {
    479                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    480             } else {
    481                 /* it must be length==1 because otherwise the above would have copied more */
    482                 cnv->toUBytes[cnv->toULength++]=*source++;
    483             }
    484         }
    485     }
    486 
    487     /* write back the updated pointers */
    488     pArgs->source=(const char *)source;
    489     pArgs->target=target;
    490     pArgs->offsets=offsets;
    491 }
    492 
    493 static UChar32  U_CALLCONV
    494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
    495     const uint8_t *s, *sourceLimit;
    496     UChar32 c;
    497 
    498     if(pArgs->converter->mode<8) {
    499         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    500     }
    501 
    502     s=(const uint8_t *)pArgs->source;
    503     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    504 
    505     if(s>=sourceLimit) {
    506         /* no input */
    507         *err=U_INDEX_OUTOFBOUNDS_ERROR;
    508         return 0xffff;
    509     }
    510 
    511     if(s+2>sourceLimit) {
    512         /* only one byte: truncated UChar */
    513         pArgs->converter->toUBytes[0]=*s++;
    514         pArgs->converter->toULength=1;
    515         pArgs->source=(const char *)s;
    516         *err = U_TRUNCATED_CHAR_FOUND;
    517         return 0xffff;
    518     }
    519 
    520     /* get one UChar */
    521     c=((UChar32)*s<<8)|s[1];
    522     s+=2;
    523 
    524     /* check for a surrogate pair */
    525     if(U_IS_SURROGATE(c)) {
    526         if(U16_IS_SURROGATE_LEAD(c)) {
    527             if(s+2<=sourceLimit) {
    528                 UChar trail;
    529 
    530                 /* get a second UChar and see if it is a trail surrogate */
    531                 trail=((UChar)*s<<8)|s[1];
    532                 if(U16_IS_TRAIL(trail)) {
    533                     c=U16_GET_SUPPLEMENTARY(c, trail);
    534                     s+=2;
    535                 } else {
    536                     /* unmatched lead surrogate */
    537                     c=-2;
    538                 }
    539             } else {
    540                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
    541                 uint8_t *bytes=pArgs->converter->toUBytes;
    542                 s-=2;
    543                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
    544                 do {
    545                     *bytes++=*s++;
    546                 } while(s<sourceLimit);
    547 
    548                 c=0xffff;
    549                 *err=U_TRUNCATED_CHAR_FOUND;
    550             }
    551         } else {
    552             /* unmatched trail surrogate */
    553             c=-2;
    554         }
    555 
    556         if(c<0) {
    557             /* write the unmatched surrogate */
    558             uint8_t *bytes=pArgs->converter->toUBytes;
    559             pArgs->converter->toULength=2;
    560             *bytes=*(s-2);
    561             bytes[1]=*(s-1);
    562 
    563             c=0xffff;
    564             *err=U_ILLEGAL_CHAR_FOUND;
    565         }
    566     }
    567 
    568     pArgs->source=(const char *)s;
    569     return c;
    570 }
    571 
    572 static void  U_CALLCONV
    573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
    574     if(choice<=UCNV_RESET_TO_UNICODE) {
    575         /* reset toUnicode state */
    576         if(UCNV_GET_VERSION(cnv)==0) {
    577             cnv->mode=8; /* no BOM handling */
    578         } else {
    579             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
    580         }
    581     }
    582     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
    583         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
    584         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
    585     }
    586 }
    587 
    588 static void  U_CALLCONV
    589 _UTF16BEOpen(UConverter *cnv,
    590              UConverterLoadArgs *pArgs,
    591              UErrorCode *pErrorCode) {
    592     (void)pArgs;
    593     if(UCNV_GET_VERSION(cnv)<=1) {
    594         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
    595     } else {
    596         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    597     }
    598 }
    599 
    600 static const char *  U_CALLCONV
    601 _UTF16BEGetName(const UConverter *cnv) {
    602     if(UCNV_GET_VERSION(cnv)==0) {
    603         return "UTF-16BE";
    604     } else {
    605         return "UTF-16BE,version=1";
    606     }
    607 }
    608 U_CDECL_END
    609 
    610 static const UConverterImpl _UTF16BEImpl={
    611     UCNV_UTF16_BigEndian,
    612 
    613     NULL,
    614     NULL,
    615 
    616     _UTF16BEOpen,
    617     NULL,
    618     _UTF16BEReset,
    619 
    620     _UTF16BEToUnicodeWithOffsets,
    621     _UTF16BEToUnicodeWithOffsets,
    622     _UTF16BEFromUnicodeWithOffsets,
    623     _UTF16BEFromUnicodeWithOffsets,
    624     _UTF16BEGetNextUChar,
    625 
    626     NULL,
    627     _UTF16BEGetName,
    628     NULL,
    629     NULL,
    630     ucnv_getNonSurrogateUnicodeSet,
    631 
    632     NULL,
    633     NULL
    634 };
    635 
    636 static const UConverterStaticData _UTF16BEStaticData={
    637     sizeof(UConverterStaticData),
    638     "UTF-16BE",
    639     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
    640     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
    641     0,
    642     0,
    643     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    644 };
    645 
    646 
    647 const UConverterSharedData _UTF16BEData=
    648         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl);
    649 
    650 /* UTF-16LE ----------------------------------------------------------------- */
    651 U_CDECL_BEGIN
    652 static void  U_CALLCONV
    653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    654                                UErrorCode *pErrorCode) {
    655     UConverter *cnv;
    656     const UChar *source;
    657     char *target;
    658     int32_t *offsets;
    659 
    660     uint32_t targetCapacity, length, sourceIndex;
    661     UChar c, trail;
    662     char overflow[4];
    663 
    664     source=pArgs->source;
    665     length=(int32_t)(pArgs->sourceLimit-source);
    666     if(length<=0) {
    667         /* no input, nothing to do */
    668         return;
    669     }
    670 
    671     cnv=pArgs->converter;
    672 
    673     /* write the BOM if necessary */
    674     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    675         static const char bom[]={ (char)0xff, (char)0xfe };
    676         ucnv_fromUWriteBytes(cnv,
    677                              bom, 2,
    678                              &pArgs->target, pArgs->targetLimit,
    679                              &pArgs->offsets, -1,
    680                              pErrorCode);
    681         cnv->fromUnicodeStatus=0;
    682     }
    683 
    684     target=pArgs->target;
    685     if(target >= pArgs->targetLimit) {
    686         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    687         return;
    688     }
    689 
    690     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    691     offsets=pArgs->offsets;
    692     sourceIndex=0;
    693 
    694     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
    695 
    696     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
    697         /* the last buffer ended with a lead surrogate, output the surrogate pair */
    698         ++source;
    699         --length;
    700         target[0]=(uint8_t)c;
    701         target[1]=(uint8_t)(c>>8);
    702         target[2]=(uint8_t)trail;
    703         target[3]=(uint8_t)(trail>>8);
    704         target+=4;
    705         targetCapacity-=4;
    706         if(offsets!=NULL) {
    707             *offsets++=-1;
    708             *offsets++=-1;
    709             *offsets++=-1;
    710             *offsets++=-1;
    711         }
    712         sourceIndex=1;
    713         cnv->fromUChar32=c=0;
    714     }
    715 
    716     if(c==0) {
    717         /* copy an even number of bytes for complete UChars */
    718         uint32_t count=2*length;
    719         if(count>targetCapacity) {
    720             count=targetCapacity&~1;
    721         }
    722         /* count is even */
    723         targetCapacity-=count;
    724         count>>=1;
    725         length-=count;
    726 
    727         if(offsets==NULL) {
    728             while(count>0) {
    729                 c=*source++;
    730                 if(U16_IS_SINGLE(c)) {
    731                     target[0]=(uint8_t)c;
    732                     target[1]=(uint8_t)(c>>8);
    733                     target+=2;
    734                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    735                     ++source;
    736                     --count;
    737                     target[0]=(uint8_t)c;
    738                     target[1]=(uint8_t)(c>>8);
    739                     target[2]=(uint8_t)trail;
    740                     target[3]=(uint8_t)(trail>>8);
    741                     target+=4;
    742                 } else {
    743                     break;
    744                 }
    745                 --count;
    746             }
    747         } else {
    748             while(count>0) {
    749                 c=*source++;
    750                 if(U16_IS_SINGLE(c)) {
    751                     target[0]=(uint8_t)c;
    752                     target[1]=(uint8_t)(c>>8);
    753                     target+=2;
    754                     *offsets++=sourceIndex;
    755                     *offsets++=sourceIndex++;
    756                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    757                     ++source;
    758                     --count;
    759                     target[0]=(uint8_t)c;
    760                     target[1]=(uint8_t)(c>>8);
    761                     target[2]=(uint8_t)trail;
    762                     target[3]=(uint8_t)(trail>>8);
    763                     target+=4;
    764                     *offsets++=sourceIndex;
    765                     *offsets++=sourceIndex;
    766                     *offsets++=sourceIndex;
    767                     *offsets++=sourceIndex;
    768                     sourceIndex+=2;
    769                 } else {
    770                     break;
    771                 }
    772                 --count;
    773             }
    774         }
    775 
    776         if(count==0) {
    777             /* done with the loop for complete UChars */
    778             if(length>0 && targetCapacity>0) {
    779                 /*
    780                  * there is more input and some target capacity -
    781                  * it must be targetCapacity==1 because otherwise
    782                  * the above would have copied more;
    783                  * prepare for overflow output
    784                  */
    785                 if(U16_IS_SINGLE(c=*source++)) {
    786                     overflow[0]=(char)c;
    787                     overflow[1]=(char)(c>>8);
    788                     length=2; /* 2 bytes to output */
    789                     c=0;
    790                 /* } else { keep c for surrogate handling, length will be set there */
    791                 }
    792             } else {
    793                 length=0;
    794                 c=0;
    795             }
    796         } else {
    797             /* keep c for surrogate handling, length will be set there */
    798             targetCapacity+=2*count;
    799         }
    800     } else {
    801         length=0; /* from here on, length counts the bytes in overflow[] */
    802     }
    803 
    804     if(c!=0) {
    805         /*
    806          * c is a surrogate, and
    807          * - source or target too short
    808          * - or the surrogate is unmatched
    809          */
    810         length=0;
    811         if(U16_IS_SURROGATE_LEAD(c)) {
    812             if(source<pArgs->sourceLimit) {
    813                 if(U16_IS_TRAIL(trail=*source)) {
    814                     /* output the surrogate pair, will overflow (see conditions comment above) */
    815                     ++source;
    816                     overflow[0]=(char)c;
    817                     overflow[1]=(char)(c>>8);
    818                     overflow[2]=(char)trail;
    819                     overflow[3]=(char)(trail>>8);
    820                     length=4; /* 4 bytes to output */
    821                     c=0;
    822                 } else {
    823                     /* unmatched lead surrogate */
    824                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    825                 }
    826             } else {
    827                 /* see if the trail surrogate is in the next buffer */
    828             }
    829         } else {
    830             /* unmatched trail surrogate */
    831             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    832         }
    833         cnv->fromUChar32=c;
    834     }
    835 
    836     if(length>0) {
    837         /* output length bytes with overflow (length>targetCapacity>0) */
    838         ucnv_fromUWriteBytes(cnv,
    839                              overflow, length,
    840                              &target, pArgs->targetLimit,
    841                              &offsets, sourceIndex,
    842                              pErrorCode);
    843         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    844     }
    845 
    846     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    847         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    848     }
    849 
    850     /* write back the updated pointers */
    851     pArgs->source=source;
    852     pArgs->target=target;
    853     pArgs->offsets=offsets;
    854 }
    855 
    856 static void  U_CALLCONV
    857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    858                              UErrorCode *pErrorCode) {
    859     UConverter *cnv;
    860     const uint8_t *source;
    861     UChar *target;
    862     int32_t *offsets;
    863 
    864     uint32_t targetCapacity, length, count, sourceIndex;
    865     UChar c, trail;
    866 
    867     if(pArgs->converter->mode<8) {
    868         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    869         return;
    870     }
    871 
    872     cnv=pArgs->converter;
    873     source=(const uint8_t *)pArgs->source;
    874     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    875     if(length<=0 && cnv->toUnicodeStatus==0) {
    876         /* no input, nothing to do */
    877         return;
    878     }
    879 
    880     target=pArgs->target;
    881     if(target >= pArgs->targetLimit) {
    882         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    883         return;
    884     }
    885 
    886     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    887     offsets=pArgs->offsets;
    888     sourceIndex=0;
    889     c=0;
    890 
    891     /* complete a partial UChar or pair from the last call */
    892     if(cnv->toUnicodeStatus!=0) {
    893         /*
    894          * special case: single byte from a previous buffer,
    895          * where the byte turned out not to belong to a trail surrogate
    896          * and the preceding, unmatched lead surrogate was put into toUBytes[]
    897          * for error handling
    898          */
    899         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    900         cnv->toULength=1;
    901         cnv->toUnicodeStatus=0;
    902     }
    903     if((count=cnv->toULength)!=0) {
    904         uint8_t *p=cnv->toUBytes;
    905         do {
    906             p[count++]=*source++;
    907             ++sourceIndex;
    908             --length;
    909             if(count==2) {
    910                 c=((UChar)p[1]<<8)|p[0];
    911                 if(U16_IS_SINGLE(c)) {
    912                     /* output the BMP code point */
    913                     *target++=c;
    914                     if(offsets!=NULL) {
    915                         *offsets++=-1;
    916                     }
    917                     --targetCapacity;
    918                     count=0;
    919                     c=0;
    920                     break;
    921                 } else if(U16_IS_SURROGATE_LEAD(c)) {
    922                     /* continue collecting bytes for the trail surrogate */
    923                     c=0; /* avoid unnecessary surrogate handling below */
    924                 } else {
    925                     /* fall through to error handling for an unmatched trail surrogate */
    926                     break;
    927                 }
    928             } else if(count==4) {
    929                 c=((UChar)p[1]<<8)|p[0];
    930                 trail=((UChar)p[3]<<8)|p[2];
    931                 if(U16_IS_TRAIL(trail)) {
    932                     /* output the surrogate pair */
    933                     *target++=c;
    934                     if(targetCapacity>=2) {
    935                         *target++=trail;
    936                         if(offsets!=NULL) {
    937                             *offsets++=-1;
    938                             *offsets++=-1;
    939                         }
    940                         targetCapacity-=2;
    941                     } else /* targetCapacity==1 */ {
    942                         targetCapacity=0;
    943                         cnv->UCharErrorBuffer[0]=trail;
    944                         cnv->UCharErrorBufferLength=1;
    945                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    946                     }
    947                     count=0;
    948                     c=0;
    949                     break;
    950                 } else {
    951                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    952                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    953 
    954                     /* back out reading the code unit after it */
    955                     if(((const uint8_t *)pArgs->source-source)>=2) {
    956                         source-=2;
    957                     } else {
    958                         /*
    959                          * if the trail unit's first byte was in a previous buffer, then
    960                          * we need to put it into a special place because toUBytes[] will be
    961                          * used for the lead unit's bytes
    962                          */
    963                         cnv->toUnicodeStatus=0x100|p[2];
    964                         --source;
    965                     }
    966                     cnv->toULength=2;
    967 
    968                     /* write back the updated pointers */
    969                     pArgs->source=(const char *)source;
    970                     pArgs->target=target;
    971                     pArgs->offsets=offsets;
    972                     return;
    973                 }
    974             }
    975         } while(length>0);
    976         cnv->toULength=(int8_t)count;
    977     }
    978 
    979     /* copy an even number of bytes for complete UChars */
    980     count=2*targetCapacity;
    981     if(count>length) {
    982         count=length&~1;
    983     }
    984     if(c==0 && count>0) {
    985         length-=count;
    986         count>>=1;
    987         targetCapacity-=count;
    988         if(offsets==NULL) {
    989             do {
    990                 c=((UChar)source[1]<<8)|source[0];
    991                 source+=2;
    992                 if(U16_IS_SINGLE(c)) {
    993                     *target++=c;
    994                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    995                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
    996                 ) {
    997                     source+=2;
    998                     --count;
    999                     *target++=c;
   1000                     *target++=trail;
   1001                 } else {
   1002                     break;
   1003                 }
   1004             } while(--count>0);
   1005         } else {
   1006             do {
   1007                 c=((UChar)source[1]<<8)|source[0];
   1008                 source+=2;
   1009                 if(U16_IS_SINGLE(c)) {
   1010                     *target++=c;
   1011                     *offsets++=sourceIndex;
   1012                     sourceIndex+=2;
   1013                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1014                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
   1015                 ) {
   1016                     source+=2;
   1017                     --count;
   1018                     *target++=c;
   1019                     *target++=trail;
   1020                     *offsets++=sourceIndex;
   1021                     *offsets++=sourceIndex;
   1022                     sourceIndex+=4;
   1023                 } else {
   1024                     break;
   1025                 }
   1026             } while(--count>0);
   1027         }
   1028 
   1029         if(count==0) {
   1030             /* done with the loop for complete UChars */
   1031             c=0;
   1032         } else {
   1033             /* keep c for surrogate handling, trail will be set there */
   1034             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
   1035             targetCapacity+=count;
   1036         }
   1037     }
   1038 
   1039     if(c!=0) {
   1040         /*
   1041          * c is a surrogate, and
   1042          * - source or target too short
   1043          * - or the surrogate is unmatched
   1044          */
   1045         cnv->toUBytes[0]=(uint8_t)c;
   1046         cnv->toUBytes[1]=(uint8_t)(c>>8);
   1047         cnv->toULength=2;
   1048 
   1049         if(U16_IS_SURROGATE_LEAD(c)) {
   1050             if(length>=2) {
   1051                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
   1052                     /* output the surrogate pair, will overflow (see conditions comment above) */
   1053                     source+=2;
   1054                     length-=2;
   1055                     *target++=c;
   1056                     if(offsets!=NULL) {
   1057                         *offsets++=sourceIndex;
   1058                     }
   1059                     cnv->UCharErrorBuffer[0]=trail;
   1060                     cnv->UCharErrorBufferLength=1;
   1061                     cnv->toULength=0;
   1062                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1063                 } else {
   1064                     /* unmatched lead surrogate */
   1065                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1066                 }
   1067             } else {
   1068                 /* see if the trail surrogate is in the next buffer */
   1069             }
   1070         } else {
   1071             /* unmatched trail surrogate */
   1072             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1073         }
   1074     }
   1075 
   1076     if(U_SUCCESS(*pErrorCode)) {
   1077         /* check for a remaining source byte */
   1078         if(length>0) {
   1079             if(targetCapacity==0) {
   1080                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1081             } else {
   1082                 /* it must be length==1 because otherwise the above would have copied more */
   1083                 cnv->toUBytes[cnv->toULength++]=*source++;
   1084             }
   1085         }
   1086     }
   1087 
   1088     /* write back the updated pointers */
   1089     pArgs->source=(const char *)source;
   1090     pArgs->target=target;
   1091     pArgs->offsets=offsets;
   1092 }
   1093 
   1094 static UChar32  U_CALLCONV
   1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
   1096     const uint8_t *s, *sourceLimit;
   1097     UChar32 c;
   1098 
   1099     if(pArgs->converter->mode<8) {
   1100         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1101     }
   1102 
   1103     s=(const uint8_t *)pArgs->source;
   1104     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1105 
   1106     if(s>=sourceLimit) {
   1107         /* no input */
   1108         *err=U_INDEX_OUTOFBOUNDS_ERROR;
   1109         return 0xffff;
   1110     }
   1111 
   1112     if(s+2>sourceLimit) {
   1113         /* only one byte: truncated UChar */
   1114         pArgs->converter->toUBytes[0]=*s++;
   1115         pArgs->converter->toULength=1;
   1116         pArgs->source=(const char *)s;
   1117         *err = U_TRUNCATED_CHAR_FOUND;
   1118         return 0xffff;
   1119     }
   1120 
   1121     /* get one UChar */
   1122     c=((UChar32)s[1]<<8)|*s;
   1123     s+=2;
   1124 
   1125     /* check for a surrogate pair */
   1126     if(U_IS_SURROGATE(c)) {
   1127         if(U16_IS_SURROGATE_LEAD(c)) {
   1128             if(s+2<=sourceLimit) {
   1129                 UChar trail;
   1130 
   1131                 /* get a second UChar and see if it is a trail surrogate */
   1132                 trail=((UChar)s[1]<<8)|*s;
   1133                 if(U16_IS_TRAIL(trail)) {
   1134                     c=U16_GET_SUPPLEMENTARY(c, trail);
   1135                     s+=2;
   1136                 } else {
   1137                     /* unmatched lead surrogate */
   1138                     c=-2;
   1139                 }
   1140             } else {
   1141                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
   1142                 uint8_t *bytes=pArgs->converter->toUBytes;
   1143                 s-=2;
   1144                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
   1145                 do {
   1146                     *bytes++=*s++;
   1147                 } while(s<sourceLimit);
   1148 
   1149                 c=0xffff;
   1150                 *err=U_TRUNCATED_CHAR_FOUND;
   1151             }
   1152         } else {
   1153             /* unmatched trail surrogate */
   1154             c=-2;
   1155         }
   1156 
   1157         if(c<0) {
   1158             /* write the unmatched surrogate */
   1159             uint8_t *bytes=pArgs->converter->toUBytes;
   1160             pArgs->converter->toULength=2;
   1161             *bytes=*(s-2);
   1162             bytes[1]=*(s-1);
   1163 
   1164             c=0xffff;
   1165             *err=U_ILLEGAL_CHAR_FOUND;
   1166         }
   1167     }
   1168 
   1169     pArgs->source=(const char *)s;
   1170     return c;
   1171 }
   1172 
   1173 static void  U_CALLCONV
   1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
   1175     if(choice<=UCNV_RESET_TO_UNICODE) {
   1176         /* reset toUnicode state */
   1177         if(UCNV_GET_VERSION(cnv)==0) {
   1178             cnv->mode=8; /* no BOM handling */
   1179         } else {
   1180             cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */
   1181         }
   1182     }
   1183     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
   1184         /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */
   1185         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1186     }
   1187 }
   1188 
   1189 static void  U_CALLCONV
   1190 _UTF16LEOpen(UConverter *cnv,
   1191              UConverterLoadArgs *pArgs,
   1192              UErrorCode *pErrorCode) {
   1193     (void)pArgs;
   1194     if(UCNV_GET_VERSION(cnv)<=1) {
   1195         _UTF16LEReset(cnv, UCNV_RESET_BOTH);
   1196     } else {
   1197         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1198     }
   1199 }
   1200 
   1201 static const char *  U_CALLCONV
   1202 _UTF16LEGetName(const UConverter *cnv) {
   1203     if(UCNV_GET_VERSION(cnv)==0) {
   1204         return "UTF-16LE";
   1205     } else {
   1206         return "UTF-16LE,version=1";
   1207     }
   1208 }
   1209 U_CDECL_END
   1210 
    /*
     * Function table for the UTF-16LE converter.
     * The entries are positional. Each conversion direction installs the same
     * WithOffsets function in two adjacent slots.
     * NOTE(review): presumably these are the plain and the with-offsets slot of
     * UConverterImpl - confirm against the struct declaration.
     */
    1211 static const UConverterImpl _UTF16LEImpl={
    1212     UCNV_UTF16_LittleEndian,
    1213
    1214     NULL,
    1215     NULL,
    1216
    1217     _UTF16LEOpen,
    1218     NULL,
    1219     _UTF16LEReset,
    1220
    1221     _UTF16LEToUnicodeWithOffsets,
    1222     _UTF16LEToUnicodeWithOffsets,
    1223     _UTF16LEFromUnicodeWithOffsets,
    1224     _UTF16LEFromUnicodeWithOffsets,
    1225     _UTF16LEGetNextUChar,
    1226
    1227     NULL,
    1228     _UTF16LEGetName,
    1229     NULL,
    1230     NULL,
    1231     ucnv_getNonSurrogateUnicodeSet,
    1232
    1233     NULL,
    1234     NULL
    1235 };
   1236 
   1237 
    /*
     * Static properties of the UTF-16LE converter.
     * 1202 sits in the position that _UTF16StaticData documents as the CCSID.
     * { 0xfd, 0xff } are the bytes of U+FFFD in little-endian order.
     * NOTE(review): presumably the substitution character and its length (2) -
     * confirm against UConverterStaticData.
     */
    1238 static const UConverterStaticData _UTF16LEStaticData={
    1239     sizeof(UConverterStaticData),
    1240     "UTF-16LE",
    1241     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
    1242     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
    1243     0,
    1244     0,
    1245     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1246 };
   1247 
   1248 
    /*
     * Shared data object for UTF-16LE, built from the static data and the
     * function table above. IS_UTF16LE() compares cnv->sharedData against the
     * address of this object.
     */
    1249 const UConverterSharedData _UTF16LEData=
    1250         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl);
   1251 
   1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
   1253 
   1254 /*
   1255  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
   1256  * accordingly.
   1257  * This is a simpler version of the UTF-32 converter, with
   1258  * fewer states for shorter BOMs.
   1259  *
   1260  * State values:
   1261  * 0    initial state
   1262  * 1    saw first byte
    1263  * 2..5 (unused)
   1264  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
   1265  * 8    UTF-16BE mode
   1266  * 9    UTF-16LE mode
   1267  *
   1268  * During detection: state==number of initial bytes seen so far.
   1269  *
   1270  * On output, emit U+FEFF as the first code point.
   1271  *
   1272  * Variants:
   1273  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
   1274  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
   1275  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
   1276  */
   1277 U_CDECL_BEGIN
   1278 static void  U_CALLCONV
   1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
   1280     if(choice<=UCNV_RESET_TO_UNICODE) {
   1281         /* reset toUnicode: state=0 */
   1282         cnv->mode=0;
   1283     }
   1284     if(choice!=UCNV_RESET_TO_UNICODE) {
   1285         /* reset fromUnicode: prepare to output the UTF-16PE BOM */
   1286         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1287     }
   1288 }
   1289 U_CDECL_END
   1290 extern const UConverterSharedData _UTF16v2Data;
   1291 U_CDECL_BEGIN
   1292 static void U_CALLCONV
   1293 _UTF16Open(UConverter *cnv,
   1294            UConverterLoadArgs *pArgs,
   1295            UErrorCode *pErrorCode) {
   1296     if(UCNV_GET_VERSION(cnv)<=2) {
   1297         if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
   1298             /*
   1299              * Switch implementation, and switch the staticData that's different
   1300              * and was copied into the UConverter.
   1301              * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
   1302              * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream.
   1303              */
   1304             cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data;
   1305             uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
   1306         }
   1307         _UTF16Reset(cnv, UCNV_RESET_BOTH);
   1308     } else {
   1309         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1310     }
   1311 }
   1312 
   1313 static const char *  U_CALLCONV
   1314 _UTF16GetName(const UConverter *cnv) {
   1315     if(UCNV_GET_VERSION(cnv)==0) {
   1316         return "UTF-16";
   1317     } else if(UCNV_GET_VERSION(cnv)==1) {
   1318         return "UTF-16,version=1";
   1319     } else {
   1320         return "UTF-16,version=2";
   1321     }
   1322 }
   1323 U_CDECL_END
   1324 extern const UConverterSharedData _UTF16Data;
   1325 
   1326 static inline bool IS_UTF16BE(const UConverter *cnv) {
   1327     return ((cnv)->sharedData == &_UTF16BEData);
   1328 }
   1329 
   1330 static inline bool IS_UTF16LE(const UConverter *cnv) {
   1331     return ((cnv)->sharedData == &_UTF16LEData);
   1332 }
   1333 
   1334 static inline bool IS_UTF16(const UConverter *cnv) {
   1335     return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data);
   1336 }
   1337 
   1338 U_CDECL_BEGIN
    /*
     * toUnicode implementation with BOM detection.
     *
     * Used by the "UTF-16" converter, and by the UTF-16LE toUnicode function
     * while its mode is below 8. cnv->mode carries the detection state between
     * calls: it is read into `state` on entry and stored back on exit.
     *
     * States 0 and 1 look at the first two bytes. Once the state is 8 or 9 the
     * rest of the input is handed to the UTF-16BE or UTF-16LE toUnicode
     * function. A reverse or missing BOM that the variant does not allow is
     * reported as U_ILLEGAL_ESCAPE_SEQUENCE with the two bytes in toUBytes[].
     */
    1339 static void U_CALLCONV
    1340 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    1341                            UErrorCode *pErrorCode) {
    1342     UConverter *cnv=pArgs->converter;
    1343     const char *source=pArgs->source;
    1344     const char *sourceLimit=pArgs->sourceLimit;
             /* start of this call's offsets; pArgs->offsets is re-read after conversion as the end */
    1345     int32_t *offsets=pArgs->offsets;
    1346
    1347     int32_t state, offsetDelta;
    1348     uint8_t b;
    1349
    1350     state=cnv->mode;
    1351
    1352     /*
    1353      * If we detect a BOM in this buffer, then we must add the BOM size to the
    1354      * offsets because the actual converter function will not see and count the BOM.
    1355      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
    1356      */
    1357     offsetDelta=0;
    1358
    1359     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
    1360         switch(state) {
    1361         case 0:
                 /* remember the first byte; the decision is made when the second one arrives */
    1362             cnv->toUBytes[0]=(uint8_t)*source++;
    1363             cnv->toULength=1;
    1364             state=1;
    1365             break;
    1366         case 1:
    1367             /*
    1368              * Only inside this switch case can the state variable
    1369              * temporarily take two additional values:
    1370              * 6: BOM error, continue with BE
    1371              * 7: BOM error, continue with LE
    1372              */
                 /* peek at the second byte; it is consumed only if a BOM is detected */
    1373             b=*source;
    1374             if(cnv->toUBytes[0]==0xfe && b==0xff) {
    1375                 if(IS_UTF16LE(cnv)) {
    1376                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
    1377                 } else {
    1378                     state=8; /* detect UTF-16BE */
    1379                 }
    1380             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
    1381                 if(IS_UTF16BE(cnv)) {
    1382                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
    1383                 } else {
    1384                     state=9; /* detect UTF-16LE */
    1385                 }
    1386             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
    1387                 state=6; /* illegal missing BOM for Java "Unicode" */
    1388             }
    1389             if(state>=8) {
    1390                 /* BOM detected, consume it */
    1391                 ++source;
    1392                 cnv->toULength=0;
    1393                 offsetDelta=(int32_t)(source-pArgs->source);
    1394             } else if(state<6) {
    1395                 /* ok: no BOM, and not a reverse BOM */
    1396                 if(source!=pArgs->source) {
    1397                     /* reset the source for a correct first offset */
    1398                     source=pArgs->source;
    1399                     cnv->toULength=0;
    1400                 }
    1401                 if(IS_UTF16LE(cnv)) {
    1402                     /* Make Java "UnicodeLittle" default to LE. */
    1403                     state=9;
    1404                 } else {
    1405                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
    1406                     state=8;
    1407                 }
    1408             } else {
    1409                 /*
    1410                  * error: missing BOM, or reverse BOM
    1411                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
    1412                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
    1413                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
    1414                  */
    1415                 /* report the non-BOM or reverse BOM as an illegal sequence */
    1416                 cnv->toUBytes[1]=b;
    1417                 cnv->toULength=2;
    1418                 pArgs->source=source+1;
    1419                 /* continue with conversion if the callback resets the error */
    1420                 /*
    1421                  * Make Java "Unicode" default to BE like standard UTF-16.
    1422                  * Make Java "UnicodeBig" and "UnicodeLittle" default
    1423                  * to their normal endiannesses.
    1424                  */
    1425                 cnv->mode=state+2; /* 6 becomes 8 (BE), 7 becomes 9 (LE) */
    1426                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
    1427                 return;
    1428             }
    1429             /* convert the rest of the stream */
    1430             cnv->mode=state;
    1431             continue;
    1432         case 8:
    1433             /* call UTF-16BE */
    1434             pArgs->source=source;
    1435             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
    1436             source=pArgs->source;
    1437             break;
    1438         case 9:
    1439             /* call UTF-16LE */
    1440             pArgs->source=source;
    1441             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
    1442             source=pArgs->source;
    1443             break;
    1444         default:
    1445             break; /* does not occur */
    1446         }
    1447     }
    1448
    1449     /* add BOM size to offsets - see comment at offsetDelta declaration */
    1450     if(offsets!=NULL && offsetDelta!=0) {
    1451         int32_t *offsetsLimit=pArgs->offsets;
    1452         while(offsets<offsetsLimit) {
    1453             *offsets++ += offsetDelta;
    1454         }
    1455     }
    1456
    1457     pArgs->source=source;
    1458
    1459     if(source==sourceLimit && pArgs->flush) {
    1460         /* handle truncated input */
    1461         switch(state) {
    1462         case 0:
    1463             break; /* no input at all, nothing to do */
    1464         case 8:
    1465             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
    1466             break;
    1467         case 9:
    1468             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
    1469             break;
    1470         default:
    1471             /* 0<state<8: framework will report truncation, nothing to do here */
    1472             break;
    1473         }
    1474     }
    1475
             /* persist the detection state for the next call */
    1476     cnv->mode=state;
    1477 }
   1478 
   1479 static UChar32 U_CALLCONV
   1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1481                    UErrorCode *pErrorCode) {
   1482     switch(pArgs->converter->mode) {
   1483     case 8:
   1484         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
   1485     case 9:
   1486         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
   1487     default:
   1488         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1489     }
   1490 }
   1491 U_CDECL_END
   1492 
    /*
     * Function table for the BOM-detecting UTF-16 converter (versions 0 and 1).
     * The entries are positional. toUnicode goes through the BOM detector;
     * fromUnicode uses _UTF16PEFromUnicodeWithOffsets, which the top of this
     * file defines as the BE or LE function depending on U_IS_BIG_ENDIAN.
     */
    1493 static const UConverterImpl _UTF16Impl = {
    1494     UCNV_UTF16,
    1495
    1496     NULL,
    1497     NULL,
    1498
    1499     _UTF16Open,
    1500     NULL,
    1501     _UTF16Reset,
    1502
    1503     _UTF16ToUnicodeWithOffsets,
    1504     _UTF16ToUnicodeWithOffsets,
    1505     _UTF16PEFromUnicodeWithOffsets,
    1506     _UTF16PEFromUnicodeWithOffsets,
    1507     _UTF16GetNextUChar,
    1508
    1509     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    1510     _UTF16GetName,
    1511     NULL,
    1512     NULL,
    1513     ucnv_getNonSurrogateUnicodeSet,
    1514
    1515     NULL,
    1516     NULL
    1517 };
   1518 
    /*
     * Static properties of the BOM-detecting UTF-16 converter.
     * The four-byte array holds the bytes of U+FFFD in platform byte order:
     * ff fd when U_IS_BIG_ENDIAN, fd ff otherwise.
     * NOTE(review): presumably the substitution character and its length (2) -
     * confirm against UConverterStaticData.
     */
    1519 static const UConverterStaticData _UTF16StaticData = {
    1520     sizeof(UConverterStaticData),
    1521     "UTF-16",
    1522     1204, /* CCSID for BOM sensitive UTF-16 */
    1523     UCNV_IBM, UCNV_UTF16, 2, 2,
    1524 #if U_IS_BIG_ENDIAN
    1525     { 0xff, 0xfd, 0, 0 }, 2,
    1526 #else
    1527     { 0xfd, 0xff, 0, 0 }, 2,
    1528 #endif
    1529     FALSE, FALSE,
    1530     0,
    1531     0,
    1532     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1533 };
   1534 
    /*
     * Shared data object for the BOM-detecting UTF-16 converter, built from the
     * static data and the function table above. IS_UTF16() compares
     * cnv->sharedData against the address of this object.
     */
    1535 const UConverterSharedData _UTF16Data =
    1536         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl);
   1537 
    /*
     * Function table for "UTF-16,version=2".
     * It differs from _UTF16Impl only in the two fromUnicode entries, which are
     * always the big-endian function instead of the platform-endian one.
     */
    1538 static const UConverterImpl _UTF16v2Impl = {
    1539     UCNV_UTF16,
    1540
    1541     NULL,
    1542     NULL,
    1543
    1544     _UTF16Open,
    1545     NULL,
    1546     _UTF16Reset,
    1547
    1548     _UTF16ToUnicodeWithOffsets,
    1549     _UTF16ToUnicodeWithOffsets,
    1550     _UTF16BEFromUnicodeWithOffsets,
    1551     _UTF16BEFromUnicodeWithOffsets,
    1552     _UTF16GetNextUChar,
    1553
    1554     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    1555     _UTF16GetName,
    1556     NULL,
    1557     NULL,
    1558     ucnv_getNonSurrogateUnicodeSet,
    1559
    1560     NULL,
    1561     NULL
    1562 };
   1563 
    /*
     * Static properties of "UTF-16,version=2".
     * The four-byte array always holds ff fd, the bytes of U+FFFD in big-endian
     * order, regardless of the platform. _UTF16Open() copies the subChar field
     * of this struct into the converter when version 2 is opened.
     * NOTE(review): presumably that array is the subChar field - confirm
     * against UConverterStaticData.
     */
    1564 static const UConverterStaticData _UTF16v2StaticData = {
    1565     sizeof(UConverterStaticData),
    1566     "UTF-16,version=2",
    1567     1204, /* CCSID for BOM sensitive UTF-16 */
    1568     UCNV_IBM, UCNV_UTF16, 2, 2,
    1569     { 0xff, 0xfd, 0, 0 }, 2,
    1570     FALSE, FALSE,
    1571     0,
    1572     0,
    1573     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1574 };
   1575 
    /*
     * Shared data object for "UTF-16,version=2", built from the v2 static data
     * and function table above. _UTF16Open() installs it as cnv->sharedData for
     * version 2, and IS_UTF16() also compares against its address.
     */
    1576 const UConverterSharedData _UTF16v2Data =
    1577         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl);
   1578 
   1579 #endif
   1580