Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv_u16.c
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2002jul01
     12 *   created by: Markus W. Scherer
     13 *
     14 *   UTF-16 converter implementation. Used to be in ucnv_utf.c.
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_CONVERSION
     20 
     21 #include "unicode/ucnv.h"
     22 #include "ucnv_bld.h"
     23 #include "ucnv_cnv.h"
     24 #include "cmemory.h"
     25 
     26 enum {
     27     UCNV_NEED_TO_WRITE_BOM=1
     28 };
     29 
     30 /*
     31  * The UTF-16 toUnicode implementation is also used for the Java-specific
     32  * "with BOM" variants of UTF-16BE and UTF-16LE.
     33  */
     34 static void
     35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
     36                            UErrorCode *pErrorCode);
     37 
     38 /* UTF-16BE ----------------------------------------------------------------- */
     39 
     40 #if U_IS_BIG_ENDIAN
     41 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16BEFromUnicodeWithOffsets
     42 #else
     43 #   define _UTF16PEFromUnicodeWithOffsets   _UTF16LEFromUnicodeWithOffsets
     44 #endif
     45 
     46 
     47 static void
     48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
     49                                UErrorCode *pErrorCode) {
     50     UConverter *cnv;
     51     const UChar *source;
     52     char *target;
     53     int32_t *offsets;
     54 
     55     uint32_t targetCapacity, length, sourceIndex;
     56     UChar c, trail;
     57     char overflow[4];
     58 
     59     source=pArgs->source;
     60     length=(int32_t)(pArgs->sourceLimit-source);
     61     if(length<=0) {
     62         /* no input, nothing to do */
     63         return;
     64     }
     65 
     66     cnv=pArgs->converter;
     67 
     68     /* write the BOM if necessary */
     69     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
     70         static const char bom[]={ (char)0xfe, (char)0xff };
     71         ucnv_fromUWriteBytes(cnv,
     72                              bom, 2,
     73                              &pArgs->target, pArgs->targetLimit,
     74                              &pArgs->offsets, -1,
     75                              pErrorCode);
     76         cnv->fromUnicodeStatus=0;
     77     }
     78 
     79     target=pArgs->target;
     80     if(target >= pArgs->targetLimit) {
     81         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
     82         return;
     83     }
     84 
     85     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
     86     offsets=pArgs->offsets;
     87     sourceIndex=0;
     88 
     89     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
     90 
     91     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
     92         /* the last buffer ended with a lead surrogate, output the surrogate pair */
     93         ++source;
     94         --length;
     95         target[0]=(uint8_t)(c>>8);
     96         target[1]=(uint8_t)c;
     97         target[2]=(uint8_t)(trail>>8);
     98         target[3]=(uint8_t)trail;
     99         target+=4;
    100         targetCapacity-=4;
    101         if(offsets!=NULL) {
    102             *offsets++=-1;
    103             *offsets++=-1;
    104             *offsets++=-1;
    105             *offsets++=-1;
    106         }
    107         sourceIndex=1;
    108         cnv->fromUChar32=c=0;
    109     }
    110 
    111     if(c==0) {
    112         /* copy an even number of bytes for complete UChars */
    113         uint32_t count=2*length;
    114         if(count>targetCapacity) {
    115             count=targetCapacity&~1;
    116         }
    117         /* count is even */
    118         targetCapacity-=count;
    119         count>>=1;
    120         length-=count;
    121 
    122         if(offsets==NULL) {
    123             while(count>0) {
    124                 c=*source++;
    125                 if(U16_IS_SINGLE(c)) {
    126                     target[0]=(uint8_t)(c>>8);
    127                     target[1]=(uint8_t)c;
    128                     target+=2;
    129                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    130                     ++source;
    131                     --count;
    132                     target[0]=(uint8_t)(c>>8);
    133                     target[1]=(uint8_t)c;
    134                     target[2]=(uint8_t)(trail>>8);
    135                     target[3]=(uint8_t)trail;
    136                     target+=4;
    137                 } else {
    138                     break;
    139                 }
    140                 --count;
    141             }
    142         } else {
    143             while(count>0) {
    144                 c=*source++;
    145                 if(U16_IS_SINGLE(c)) {
    146                     target[0]=(uint8_t)(c>>8);
    147                     target[1]=(uint8_t)c;
    148                     target+=2;
    149                     *offsets++=sourceIndex;
    150                     *offsets++=sourceIndex++;
    151                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    152                     ++source;
    153                     --count;
    154                     target[0]=(uint8_t)(c>>8);
    155                     target[1]=(uint8_t)c;
    156                     target[2]=(uint8_t)(trail>>8);
    157                     target[3]=(uint8_t)trail;
    158                     target+=4;
    159                     *offsets++=sourceIndex;
    160                     *offsets++=sourceIndex;
    161                     *offsets++=sourceIndex;
    162                     *offsets++=sourceIndex;
    163                     sourceIndex+=2;
    164                 } else {
    165                     break;
    166                 }
    167                 --count;
    168             }
    169         }
    170 
    171         if(count==0) {
    172             /* done with the loop for complete UChars */
    173             if(length>0 && targetCapacity>0) {
    174                 /*
    175                  * there is more input and some target capacity -
    176                  * it must be targetCapacity==1 because otherwise
    177                  * the above would have copied more;
    178                  * prepare for overflow output
    179                  */
    180                 if(U16_IS_SINGLE(c=*source++)) {
    181                     overflow[0]=(char)(c>>8);
    182                     overflow[1]=(char)c;
    183                     length=2; /* 2 bytes to output */
    184                     c=0;
    185                 /* } else { keep c for surrogate handling, length will be set there */
    186                 }
    187             } else {
    188                 length=0;
    189                 c=0;
    190             }
    191         } else {
    192             /* keep c for surrogate handling, length will be set there */
    193             targetCapacity+=2*count;
    194         }
    195     } else {
    196         length=0; /* from here on, length counts the bytes in overflow[] */
    197     }
    198 
    199     if(c!=0) {
    200         /*
    201          * c is a surrogate, and
    202          * - source or target too short
    203          * - or the surrogate is unmatched
    204          */
    205         length=0;
    206         if(U16_IS_SURROGATE_LEAD(c)) {
    207             if(source<pArgs->sourceLimit) {
    208                 if(U16_IS_TRAIL(trail=*source)) {
    209                     /* output the surrogate pair, will overflow (see conditions comment above) */
    210                     ++source;
    211                     overflow[0]=(char)(c>>8);
    212                     overflow[1]=(char)c;
    213                     overflow[2]=(char)(trail>>8);
    214                     overflow[3]=(char)trail;
    215                     length=4; /* 4 bytes to output */
    216                     c=0;
    217                 } else {
    218                     /* unmatched lead surrogate */
    219                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    220                 }
    221             } else {
    222                 /* see if the trail surrogate is in the next buffer */
    223             }
    224         } else {
    225             /* unmatched trail surrogate */
    226             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    227         }
    228         cnv->fromUChar32=c;
    229     }
    230 
    231     if(length>0) {
    232         /* output length bytes with overflow (length>targetCapacity>0) */
    233         ucnv_fromUWriteBytes(cnv,
    234                              overflow, length,
    235                              (char **)&target, pArgs->targetLimit,
    236                              &offsets, sourceIndex,
    237                              pErrorCode);
    238         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    239     }
    240 
    241     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    242         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    243     }
    244 
    245     /* write back the updated pointers */
    246     pArgs->source=source;
    247     pArgs->target=(char *)target;
    248     pArgs->offsets=offsets;
    249 }
    250 
    251 static void
    252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    253                              UErrorCode *pErrorCode) {
    254     UConverter *cnv;
    255     const uint8_t *source;
    256     UChar *target;
    257     int32_t *offsets;
    258 
    259     uint32_t targetCapacity, length, count, sourceIndex;
    260     UChar c, trail;
    261 
    262     if(pArgs->converter->mode<8) {
    263         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    264         return;
    265     }
    266 
    267     cnv=pArgs->converter;
    268     source=(const uint8_t *)pArgs->source;
    269     length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source);
    270     if(length<=0 && cnv->toUnicodeStatus==0) {
    271         /* no input, nothing to do */
    272         return;
    273     }
    274 
    275     target=pArgs->target;
    276     if(target >= pArgs->targetLimit) {
    277         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    278         return;
    279     }
    280 
    281     targetCapacity=(uint32_t)(pArgs->targetLimit-target);
    282     offsets=pArgs->offsets;
    283     sourceIndex=0;
    284     c=0;
    285 
    286     /* complete a partial UChar or pair from the last call */
    287     if(cnv->toUnicodeStatus!=0) {
    288         /*
    289          * special case: single byte from a previous buffer,
    290          * where the byte turned out not to belong to a trail surrogate
    291          * and the preceding, unmatched lead surrogate was put into toUBytes[]
    292          * for error handling
    293          */
    294         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    295         cnv->toULength=1;
    296         cnv->toUnicodeStatus=0;
    297     }
    298     if((count=cnv->toULength)!=0) {
    299         uint8_t *p=cnv->toUBytes;
    300         do {
    301             p[count++]=*source++;
    302             ++sourceIndex;
    303             --length;
    304             if(count==2) {
    305                 c=((UChar)p[0]<<8)|p[1];
    306                 if(U16_IS_SINGLE(c)) {
    307                     /* output the BMP code point */
    308                     *target++=c;
    309                     if(offsets!=NULL) {
    310                         *offsets++=-1;
    311                     }
    312                     --targetCapacity;
    313                     count=0;
    314                     c=0;
    315                     break;
    316                 } else if(U16_IS_SURROGATE_LEAD(c)) {
    317                     /* continue collecting bytes for the trail surrogate */
    318                     c=0; /* avoid unnecessary surrogate handling below */
    319                 } else {
    320                     /* fall through to error handling for an unmatched trail surrogate */
    321                     break;
    322                 }
    323             } else if(count==4) {
    324                 c=((UChar)p[0]<<8)|p[1];
    325                 trail=((UChar)p[2]<<8)|p[3];
    326                 if(U16_IS_TRAIL(trail)) {
    327                     /* output the surrogate pair */
    328                     *target++=c;
    329                     if(targetCapacity>=2) {
    330                         *target++=trail;
    331                         if(offsets!=NULL) {
    332                             *offsets++=-1;
    333                             *offsets++=-1;
    334                         }
    335                         targetCapacity-=2;
    336                     } else /* targetCapacity==1 */ {
    337                         targetCapacity=0;
    338                         cnv->UCharErrorBuffer[0]=trail;
    339                         cnv->UCharErrorBufferLength=1;
    340                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    341                     }
    342                     count=0;
    343                     c=0;
    344                     break;
    345                 } else {
    346                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    347                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    348 
    349                     /* back out reading the code unit after it */
    350                     if(((const uint8_t *)pArgs->source-source)>=2) {
    351                         source-=2;
    352                     } else {
    353                         /*
    354                          * if the trail unit's first byte was in a previous buffer, then
    355                          * we need to put it into a special place because toUBytes[] will be
    356                          * used for the lead unit's bytes
    357                          */
    358                         cnv->toUnicodeStatus=0x100|p[2];
    359                         --source;
    360                     }
    361                     cnv->toULength=2;
    362 
    363                     /* write back the updated pointers */
    364                     pArgs->source=(const char *)source;
    365                     pArgs->target=target;
    366                     pArgs->offsets=offsets;
    367                     return;
    368                 }
    369             }
    370         } while(length>0);
    371         cnv->toULength=(int8_t)count;
    372     }
    373 
    374     /* copy an even number of bytes for complete UChars */
    375     count=2*targetCapacity;
    376     if(count>length) {
    377         count=length&~1;
    378     }
    379     if(c==0 && count>0) {
    380         length-=count;
    381         count>>=1;
    382         targetCapacity-=count;
    383         if(offsets==NULL) {
    384             do {
    385                 c=((UChar)source[0]<<8)|source[1];
    386                 source+=2;
    387                 if(U16_IS_SINGLE(c)) {
    388                     *target++=c;
    389                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    390                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
    391                 ) {
    392                     source+=2;
    393                     --count;
    394                     *target++=c;
    395                     *target++=trail;
    396                 } else {
    397                     break;
    398                 }
    399             } while(--count>0);
    400         } else {
    401             do {
    402                 c=((UChar)source[0]<<8)|source[1];
    403                 source+=2;
    404                 if(U16_IS_SINGLE(c)) {
    405                     *target++=c;
    406                     *offsets++=sourceIndex;
    407                     sourceIndex+=2;
    408                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    409                           U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])
    410                 ) {
    411                     source+=2;
    412                     --count;
    413                     *target++=c;
    414                     *target++=trail;
    415                     *offsets++=sourceIndex;
    416                     *offsets++=sourceIndex;
    417                     sourceIndex+=4;
    418                 } else {
    419                     break;
    420                 }
    421             } while(--count>0);
    422         }
    423 
    424         if(count==0) {
    425             /* done with the loop for complete UChars */
    426             c=0;
    427         } else {
    428             /* keep c for surrogate handling, trail will be set there */
    429             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
    430             targetCapacity+=count;
    431         }
    432     }
    433 
    434     if(c!=0) {
    435         /*
    436          * c is a surrogate, and
    437          * - source or target too short
    438          * - or the surrogate is unmatched
    439          */
    440         cnv->toUBytes[0]=(uint8_t)(c>>8);
    441         cnv->toUBytes[1]=(uint8_t)c;
    442         cnv->toULength=2;
    443 
    444         if(U16_IS_SURROGATE_LEAD(c)) {
    445             if(length>=2) {
    446                 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) {
    447                     /* output the surrogate pair, will overflow (see conditions comment above) */
    448                     source+=2;
    449                     length-=2;
    450                     *target++=c;
    451                     if(offsets!=NULL) {
    452                         *offsets++=sourceIndex;
    453                     }
    454                     cnv->UCharErrorBuffer[0]=trail;
    455                     cnv->UCharErrorBufferLength=1;
    456                     cnv->toULength=0;
    457                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    458                 } else {
    459                     /* unmatched lead surrogate */
    460                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    461                 }
    462             } else {
    463                 /* see if the trail surrogate is in the next buffer */
    464             }
    465         } else {
    466             /* unmatched trail surrogate */
    467             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    468         }
    469     }
    470 
    471     if(U_SUCCESS(*pErrorCode)) {
    472         /* check for a remaining source byte */
    473         if(length>0) {
    474             if(targetCapacity==0) {
    475                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    476             } else {
    477                 /* it must be length==1 because otherwise the above would have copied more */
    478                 cnv->toUBytes[cnv->toULength++]=*source++;
    479             }
    480         }
    481     }
    482 
    483     /* write back the updated pointers */
    484     pArgs->source=(const char *)source;
    485     pArgs->target=target;
    486     pArgs->offsets=offsets;
    487 }
    488 
    489 static UChar32
    490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
    491     const uint8_t *s, *sourceLimit;
    492     UChar32 c;
    493 
    494     if(pArgs->converter->mode<8) {
    495         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    496     }
    497 
    498     s=(const uint8_t *)pArgs->source;
    499     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    500 
    501     if(s>=sourceLimit) {
    502         /* no input */
    503         *err=U_INDEX_OUTOFBOUNDS_ERROR;
    504         return 0xffff;
    505     }
    506 
    507     if(s+2>sourceLimit) {
    508         /* only one byte: truncated UChar */
    509         pArgs->converter->toUBytes[0]=*s++;
    510         pArgs->converter->toULength=1;
    511         pArgs->source=(const char *)s;
    512         *err = U_TRUNCATED_CHAR_FOUND;
    513         return 0xffff;
    514     }
    515 
    516     /* get one UChar */
    517     c=((UChar32)*s<<8)|s[1];
    518     s+=2;
    519 
    520     /* check for a surrogate pair */
    521     if(U_IS_SURROGATE(c)) {
    522         if(U16_IS_SURROGATE_LEAD(c)) {
    523             if(s+2<=sourceLimit) {
    524                 UChar trail;
    525 
    526                 /* get a second UChar and see if it is a trail surrogate */
    527                 trail=((UChar)*s<<8)|s[1];
    528                 if(U16_IS_TRAIL(trail)) {
    529                     c=U16_GET_SUPPLEMENTARY(c, trail);
    530                     s+=2;
    531                 } else {
    532                     /* unmatched lead surrogate */
    533                     c=-2;
    534                 }
    535             } else {
    536                 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */
    537                 uint8_t *bytes=pArgs->converter->toUBytes;
    538                 s-=2;
    539                 pArgs->converter->toULength=(int8_t)(sourceLimit-s);
    540                 do {
    541                     *bytes++=*s++;
    542                 } while(s<sourceLimit);
    543 
    544                 c=0xffff;
    545                 *err=U_TRUNCATED_CHAR_FOUND;
    546             }
    547         } else {
    548             /* unmatched trail surrogate */
    549             c=-2;
    550         }
    551 
    552         if(c<0) {
    553             /* write the unmatched surrogate */
    554             uint8_t *bytes=pArgs->converter->toUBytes;
    555             pArgs->converter->toULength=2;
    556             *bytes=*(s-2);
    557             bytes[1]=*(s-1);
    558 
    559             c=0xffff;
    560             *err=U_ILLEGAL_CHAR_FOUND;
    561         }
    562     }
    563 
    564     pArgs->source=(const char *)s;
    565     return c;
    566 }
    567 
    568 static void
    569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) {
    570     if(choice<=UCNV_RESET_TO_UNICODE) {
    571         /* reset toUnicode state */
    572         if(UCNV_GET_VERSION(cnv)==0) {
    573             cnv->mode=8; /* no BOM handling */
    574         } else {
    575             cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */
    576         }
    577     }
    578     if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) {
    579         /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */
    580         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
    581     }
    582 }
    583 
    584 static void
    585 _UTF16BEOpen(UConverter *cnv,
    586              UConverterLoadArgs *pArgs,
    587              UErrorCode *pErrorCode) {
    588     if(UCNV_GET_VERSION(cnv)<=1) {
    589         _UTF16BEReset(cnv, UCNV_RESET_BOTH);
    590     } else {
    591         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    592     }
    593 }
    594 
    595 static const char *
    596 _UTF16BEGetName(const UConverter *cnv) {
    597     if(UCNV_GET_VERSION(cnv)==0) {
    598         return "UTF-16BE";
    599     } else {
    600         return "UTF-16BE,version=1";
    601     }
    602 }
    603 
    604 static const UConverterImpl _UTF16BEImpl={
    605     UCNV_UTF16_BigEndian,
    606 
    607     NULL,
    608     NULL,
    609 
    610     _UTF16BEOpen,
    611     NULL,
    612     _UTF16BEReset,
    613 
    614     _UTF16BEToUnicodeWithOffsets,
    615     _UTF16BEToUnicodeWithOffsets,
    616     _UTF16BEFromUnicodeWithOffsets,
    617     _UTF16BEFromUnicodeWithOffsets,
    618     _UTF16BEGetNextUChar,
    619 
    620     NULL,
    621     _UTF16BEGetName,
    622     NULL,
    623     NULL,
    624     ucnv_getNonSurrogateUnicodeSet
    625 };
    626 
    627 static const UConverterStaticData _UTF16BEStaticData={
    628     sizeof(UConverterStaticData),
    629     "UTF-16BE",
    630     1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2,
    631     { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE,
    632     0,
    633     0,
    634     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    635 };
    636 
    637 
    638 const UConverterSharedData _UTF16BEData={
    639     sizeof(UConverterSharedData), ~((uint32_t) 0),
    640     NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl,
    641     0
    642 };
    643 
    644 /* UTF-16LE ----------------------------------------------------------------- */
    645 
    646 static void
    647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    648                                UErrorCode *pErrorCode) {
    649     UConverter *cnv;
    650     const UChar *source;
    651     char *target;
    652     int32_t *offsets;
    653 
    654     uint32_t targetCapacity, length, sourceIndex;
    655     UChar c, trail;
    656     char overflow[4];
    657 
    658     source=pArgs->source;
    659     length=(int32_t)(pArgs->sourceLimit-source);
    660     if(length<=0) {
    661         /* no input, nothing to do */
    662         return;
    663     }
    664 
    665     cnv=pArgs->converter;
    666 
    667     /* write the BOM if necessary */
    668     if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    669         static const char bom[]={ (char)0xff, (char)0xfe };
    670         ucnv_fromUWriteBytes(cnv,
    671                              bom, 2,
    672                              &pArgs->target, pArgs->targetLimit,
    673                              &pArgs->offsets, -1,
    674                              pErrorCode);
    675         cnv->fromUnicodeStatus=0;
    676     }
    677 
    678     target=pArgs->target;
    679     if(target >= pArgs->targetLimit) {
    680         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    681         return;
    682     }
    683 
    684     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    685     offsets=pArgs->offsets;
    686     sourceIndex=0;
    687 
    688     /* c!=0 indicates in several places outside the main loops that a surrogate was found */
    689 
    690     if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) {
    691         /* the last buffer ended with a lead surrogate, output the surrogate pair */
    692         ++source;
    693         --length;
    694         target[0]=(uint8_t)c;
    695         target[1]=(uint8_t)(c>>8);
    696         target[2]=(uint8_t)trail;
    697         target[3]=(uint8_t)(trail>>8);
    698         target+=4;
    699         targetCapacity-=4;
    700         if(offsets!=NULL) {
    701             *offsets++=-1;
    702             *offsets++=-1;
    703             *offsets++=-1;
    704             *offsets++=-1;
    705         }
    706         sourceIndex=1;
    707         cnv->fromUChar32=c=0;
    708     }
    709 
    710     if(c==0) {
    711         /* copy an even number of bytes for complete UChars */
    712         uint32_t count=2*length;
    713         if(count>targetCapacity) {
    714             count=targetCapacity&~1;
    715         }
    716         /* count is even */
    717         targetCapacity-=count;
    718         count>>=1;
    719         length-=count;
    720 
    721         if(offsets==NULL) {
    722             while(count>0) {
    723                 c=*source++;
    724                 if(U16_IS_SINGLE(c)) {
    725                     target[0]=(uint8_t)c;
    726                     target[1]=(uint8_t)(c>>8);
    727                     target+=2;
    728                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    729                     ++source;
    730                     --count;
    731                     target[0]=(uint8_t)c;
    732                     target[1]=(uint8_t)(c>>8);
    733                     target[2]=(uint8_t)trail;
    734                     target[3]=(uint8_t)(trail>>8);
    735                     target+=4;
    736                 } else {
    737                     break;
    738                 }
    739                 --count;
    740             }
    741         } else {
    742             while(count>0) {
    743                 c=*source++;
    744                 if(U16_IS_SINGLE(c)) {
    745                     target[0]=(uint8_t)c;
    746                     target[1]=(uint8_t)(c>>8);
    747                     target+=2;
    748                     *offsets++=sourceIndex;
    749                     *offsets++=sourceIndex++;
    750                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) {
    751                     ++source;
    752                     --count;
    753                     target[0]=(uint8_t)c;
    754                     target[1]=(uint8_t)(c>>8);
    755                     target[2]=(uint8_t)trail;
    756                     target[3]=(uint8_t)(trail>>8);
    757                     target+=4;
    758                     *offsets++=sourceIndex;
    759                     *offsets++=sourceIndex;
    760                     *offsets++=sourceIndex;
    761                     *offsets++=sourceIndex;
    762                     sourceIndex+=2;
    763                 } else {
    764                     break;
    765                 }
    766                 --count;
    767             }
    768         }
    769 
    770         if(count==0) {
    771             /* done with the loop for complete UChars */
    772             if(length>0 && targetCapacity>0) {
    773                 /*
    774                  * there is more input and some target capacity -
    775                  * it must be targetCapacity==1 because otherwise
    776                  * the above would have copied more;
    777                  * prepare for overflow output
    778                  */
    779                 if(U16_IS_SINGLE(c=*source++)) {
    780                     overflow[0]=(char)c;
    781                     overflow[1]=(char)(c>>8);
    782                     length=2; /* 2 bytes to output */
    783                     c=0;
    784                 /* } else { keep c for surrogate handling, length will be set there */
    785                 }
    786             } else {
    787                 length=0;
    788                 c=0;
    789             }
    790         } else {
    791             /* keep c for surrogate handling, length will be set there */
    792             targetCapacity+=2*count;
    793         }
    794     } else {
    795         length=0; /* from here on, length counts the bytes in overflow[] */
    796     }
    797 
    798     if(c!=0) {
    799         /*
    800          * c is a surrogate, and
    801          * - source or target too short
    802          * - or the surrogate is unmatched
    803          */
    804         length=0;
    805         if(U16_IS_SURROGATE_LEAD(c)) {
    806             if(source<pArgs->sourceLimit) {
    807                 if(U16_IS_TRAIL(trail=*source)) {
    808                     /* output the surrogate pair, will overflow (see conditions comment above) */
    809                     ++source;
    810                     overflow[0]=(char)c;
    811                     overflow[1]=(char)(c>>8);
    812                     overflow[2]=(char)trail;
    813                     overflow[3]=(char)(trail>>8);
    814                     length=4; /* 4 bytes to output */
    815                     c=0;
    816                 } else {
    817                     /* unmatched lead surrogate */
    818                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    819                 }
    820             } else {
    821                 /* see if the trail surrogate is in the next buffer */
    822             }
    823         } else {
    824             /* unmatched trail surrogate */
    825             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    826         }
    827         cnv->fromUChar32=c;
    828     }
    829 
    830     if(length>0) {
    831         /* output length bytes with overflow (length>targetCapacity>0) */
    832         ucnv_fromUWriteBytes(cnv,
    833                              overflow, length,
    834                              &target, pArgs->targetLimit,
    835                              &offsets, sourceIndex,
    836                              pErrorCode);
    837         targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target);
    838     }
    839 
    840     if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) {
    841         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    842     }
    843 
    844     /* write back the updated pointers */
    845     pArgs->source=source;
    846     pArgs->target=target;
    847     pArgs->offsets=offsets;
    848 }
    849 
    /*
     * UTF-16LE toUnicode implementation, with or without offsets.
     *
     * Reads little-endian byte pairs from pArgs->source and writes UChars to
     * pArgs->target. When pArgs->offsets is not NULL, one source offset is
     * written per output UChar (-1 for UChars that are completed from bytes
     * stored during an earlier call).
     * While cnv->mode<8 the call is handed to _UTF16ToUnicodeWithOffsets().
     *
     * State kept in the converter between calls:
     * - toUBytes[]/toULength: bytes of an incomplete code unit or surrogate pair,
     *   or the bytes of an unmatched surrogate when an error is reported
     * - toUnicodeStatus: 0x100|byte for a single byte that follows an unmatched
     *   lead surrogate whose bytes occupy toUBytes[]
     * - UCharErrorBuffer[]: a trail surrogate that did not fit into the target
     *
     * Sets *pErrorCode to U_BUFFER_OVERFLOW_ERROR when the target is full and
     * to U_ILLEGAL_CHAR_FOUND for an unmatched surrogate.
     * pArgs->source, pArgs->target and pArgs->offsets are updated on return.
     */
    850 static void
    851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    852                              UErrorCode *pErrorCode) {
    853     UConverter *cnv;
    854     const uint8_t *source;
    855     UChar *target;
    856     int32_t *offsets;
    857 
    858     uint32_t targetCapacity, length, count, sourceIndex;
    859     UChar c, trail;
    860 
    861     if(pArgs->converter->mode<8) {
    862         _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode);
    863         return;
    864     }
    865 
    866     cnv=pArgs->converter;
    867     source=(const uint8_t *)pArgs->source;
    868     length=(uint32_t)((const uint8_t *)pArgs->sourceLimit-source);
    869     if(length==0 && cnv->toUnicodeStatus==0) {
    870         /* no input, nothing to do */
    871         return;
    872     }
    873 
    874     target=pArgs->target;
    875     if(target >= pArgs->targetLimit) {
    876         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    877         return;
    878     }
    879 
    880     targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target);
    881     offsets=pArgs->offsets;
    882     sourceIndex=0;
    883     c=0;
    884 
    885     /* complete a partial UChar or pair from the last call */
    886     if(cnv->toUnicodeStatus!=0) {
    887         /*
    888          * special case: single byte from a previous buffer,
    889          * where the byte turned out not to belong to a trail surrogate
    890          * and the preceding, unmatched lead surrogate was put into toUBytes[]
    891          * for error handling
    892          */
    893         cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus;
    894         cnv->toULength=1;
    895         cnv->toUnicodeStatus=0;
    896     }
    897     if((count=cnv->toULength)!=0) {
    898         uint8_t *p=cnv->toUBytes;
            /*
             * length can be 0 here when only toUnicodeStatus was pending;
             * never read from an empty source buffer
             */
    899         while(length>0) {
    900             p[count++]=*source++;
    901             ++sourceIndex;
    902             --length;
    903             if(count==2) {
    904                 c=((UChar)p[1]<<8)|p[0];
    905                 if(U16_IS_SINGLE(c)) {
    906                     /* output the BMP code point */
    907                     *target++=c;
    908                     if(offsets!=NULL) {
    909                         *offsets++=-1;
    910                     }
    911                     --targetCapacity;
    912                     count=0;
    913                     c=0;
    914                     break;
    915                 } else if(U16_IS_SURROGATE_LEAD(c)) {
    916                     /* continue collecting bytes for the trail surrogate */
    917                     c=0; /* avoid unnecessary surrogate handling below */
    918                 } else {
    919                     /* fall through to error handling for an unmatched trail surrogate */
    920                     break;
    921                 }
    922             } else if(count==4) {
    923                 c=((UChar)p[1]<<8)|p[0];
    924                 trail=((UChar)p[3]<<8)|p[2];
    925                 if(U16_IS_TRAIL(trail)) {
    926                     /* output the surrogate pair */
    927                     *target++=c;
    928                     if(targetCapacity>=2) {
    929                         *target++=trail;
    930                         if(offsets!=NULL) {
    931                             *offsets++=-1;
    932                             *offsets++=-1;
    933                         }
    934                         targetCapacity-=2;
    935                     } else /* targetCapacity==1 */ {
    936                         targetCapacity=0;
    937                         cnv->UCharErrorBuffer[0]=trail;
    938                         cnv->UCharErrorBufferLength=1;
    939                         *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    940                     }
    941                     count=0;
    942                     c=0;
    943                     break;
    944                 } else {
    945                     /* unmatched lead surrogate, handle here for consistent toUBytes[] */
    946                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
    947 
    948                     /* back out reading the code unit after it */
                            /*
                             * source has advanced from pArgs->source, so the number of bytes
                             * consumed from this buffer is source-pArgs->source;
                             * when both bytes of the following unit came from this buffer,
                             * simply un-read them
                             */
    949                     if((source-(const uint8_t *)pArgs->source)>=2) {
    950                         source-=2;
    951                     } else {
    952                         /*
    953                          * if the trail unit's first byte was in a previous buffer, then
    954                          * we need to put it into a special place because toUBytes[] will be
    955                          * used for the lead unit's bytes
    956                          */
    957                         cnv->toUnicodeStatus=0x100|p[2];
    958                         --source;
    959                     }
    960                     cnv->toULength=2;
    961 
    962                     /* write back the updated pointers */
    963                     pArgs->source=(const char *)source;
    964                     pArgs->target=target;
    965                     pArgs->offsets=offsets;
    966                     return;
    967                 }
    968             }
    969         }
    970         cnv->toULength=(int8_t)count;
    971     }
    972 
    973     /* copy an even number of bytes for complete UChars */
            /* count is in bytes here: limited by the target capacity and by the whole byte pairs available */
    974     count=2*targetCapacity;
    975     if(count>length) {
    976         count=length&~1;
    977     }
    978     if(c==0 && count>0) {
                /* from here on, count is the number of UChars still to be produced by the loop */
    979         length-=count;
    980         count>>=1;
    981         targetCapacity-=count;
    982         if(offsets==NULL) {
    983             do {
    984                 c=((UChar)source[1]<<8)|source[0];
    985                 source+=2;
    986                 if(U16_IS_SINGLE(c)) {
    987                     *target++=c;
    988                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
    989                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
    990                 ) {
    991                     source+=2;
    992                     --count;
    993                     *target++=c;
    994                     *target++=trail;
    995                 } else {
    996                     break;
    997                 }
    998             } while(--count>0);
    999         } else {
   1000             do {
   1001                 c=((UChar)source[1]<<8)|source[0];
   1002                 source+=2;
   1003                 if(U16_IS_SINGLE(c)) {
   1004                     *target++=c;
   1005                     *offsets++=sourceIndex;
   1006                     sourceIndex+=2;
   1007                 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 &&
   1008                           U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])
   1009                 ) {
   1010                     source+=2;
   1011                     --count;
   1012                     *target++=c;
   1013                     *target++=trail;
   1014                     *offsets++=sourceIndex;
   1015                     *offsets++=sourceIndex;
   1016                     sourceIndex+=4;
   1017                 } else {
   1018                     break;
   1019                 }
   1020             } while(--count>0);
   1021         }
   1022 
   1023         if(count==0) {
   1024             /* done with the loop for complete UChars */
   1025             c=0;
   1026         } else {
   1027             /* keep c for surrogate handling, trail will be set there */
   1028             length+=2*(count-1); /* one more byte pair was consumed than count decremented */
   1029             targetCapacity+=count;
   1030         }
   1031     }
   1032 
   1033     if(c!=0) {
   1034         /*
   1035          * c is a surrogate, and
   1036          * - source or target too short
   1037          * - or the surrogate is unmatched
   1038          */
   1039         cnv->toUBytes[0]=(uint8_t)c;
   1040         cnv->toUBytes[1]=(uint8_t)(c>>8);
   1041         cnv->toULength=2;
   1042 
   1043         if(U16_IS_SURROGATE_LEAD(c)) {
   1044             if(length>=2) {
   1045                 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) {
   1046                     /* output the surrogate pair, will overflow (see conditions comment above) */
   1047                     source+=2;
   1048                     length-=2;
   1049                     *target++=c;
   1050                     if(offsets!=NULL) {
   1051                         *offsets++=sourceIndex;
   1052                     }
   1053                     cnv->UCharErrorBuffer[0]=trail;
   1054                     cnv->UCharErrorBufferLength=1;
   1055                     cnv->toULength=0;
   1056                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1057                 } else {
   1058                     /* unmatched lead surrogate */
   1059                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1060                 }
   1061             } else {
   1062                 /* see if the trail surrogate is in the next buffer */
   1063             }
   1064         } else {
   1065             /* unmatched trail surrogate */
   1066             *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1067         }
   1068     }
   1069 
   1070     if(U_SUCCESS(*pErrorCode)) {
   1071         /* check for a remaining source byte */
   1072         if(length>0) {
   1073             if(targetCapacity==0) {
   1074                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1075             } else {
   1076                 /* it must be length==1 because otherwise the above would have copied more */
                        /* store the byte after whatever toUBytes[] already holds, for the next call */
   1077                 cnv->toUBytes[cnv->toULength++]=*source++;
   1078             }
   1079         }
   1080     }
   1081 
   1082     /* write back the updated pointers */
   1083     pArgs->source=(const char *)source;
   1084     pArgs->target=target;
   1085     pArgs->offsets=offsets;
   1086 }
   1087 
   /*
    * Reads one code point from UTF-16LE input at pArgs->source.
    * Returns the code point and advances pArgs->source past it.
    * On failure returns 0xffff, sets *err, and for truncated or illegal input
    * stores the offending bytes in the converter's toUBytes[]/toULength.
    * While cnv->mode<8, returns UCNV_GET_NEXT_UCHAR_USE_TO_U without reading.
    */
   1088 static UChar32
   1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) {
   1090     UConverter *cnv=pArgs->converter;
   1091     const uint8_t *src, *limit;
   1092     uint8_t *errBytes;
   1093     UChar32 c;
   1094     UChar second;
   1095 
   1096     if(cnv->mode<8) {
   1097         /* the mode is not yet one of the conversion states 8 or 9 */
   1098         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1099     }
   1100 
   1101     src=(const uint8_t *)pArgs->source;
   1102     limit=(const uint8_t *)pArgs->sourceLimit;
   1103     errBytes=cnv->toUBytes;
   1104 
   1105     if(src>=limit) {
   1106         /* empty input */
   1107         *err=U_INDEX_OUTOFBOUNDS_ERROR;
   1108         return 0xffff;
   1109     }
   1110 
   1111     if(limit-src<2) {
   1112         /* a lone byte cannot form a UChar: truncated */
   1113         errBytes[0]=*src++;
   1114         cnv->toULength=1;
   1115         pArgs->source=(const char *)src;
   1116         *err=U_TRUNCATED_CHAR_FOUND;
   1117         return 0xffff;
   1118     }
   1119 
   1120     /* assemble one little-endian code unit */
   1121     c=(UChar32)src[0]|((UChar32)src[1]<<8);
   1122     src+=2;
   1123 
   1124     if(!U_IS_SURROGATE(c)) {
   1125         /* not a surrogate: the code unit is the code point */
   1126         pArgs->source=(const char *)src;
   1127         return c;
   1128     }
   1129 
   1130     if(U16_IS_SURROGATE_LEAD(c)) {
   1131         if(limit-src<2) {
   1132             /*
   1133              * 2 or 3 bytes are left, not enough for a surrogate pair:
   1134              * report all of them, starting with the lead unit, as truncated
   1135              */
   1136             src-=2;
   1137             cnv->toULength=(int8_t)(limit-src);
   1138             while(src<limit) {
   1139                 *errBytes++=*src++;
   1140             }
   1141             pArgs->source=(const char *)src;
   1142             *err=U_TRUNCATED_CHAR_FOUND;
   1143             return 0xffff;
   1144         }
   1145 
   1146         /* a second code unit is available: is it a trail surrogate? */
   1147         second=(UChar)(src[0]|((UChar)src[1]<<8));
   1148         if(U16_IS_TRAIL(second)) {
   1149             pArgs->source=(const char *)(src+2);
   1150             return U16_GET_SUPPLEMENTARY(c, second);
   1151         }
   1152     }
   1153 
   1154     /*
   1155      * unmatched surrogate (lead without trail, or trail alone):
   1156      * report its two bytes, which immediately precede src
   1157      */
   1158     cnv->toULength=2;
   1159     errBytes[0]=src[-2];
   1160     errBytes[1]=src[-1];
   1161     *err=U_ILLEGAL_CHAR_FOUND;
   1162 
   1163     pArgs->source=(const char *)src;
   1164     return 0xffff;
   1165 }
   1166 
   /*
    * Resets the toUnicode and/or fromUnicode state of a UTF-16LE converter,
    * as selected by choice.
    */
   1167 static void
   1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) {
   1169     if(choice<=UCNV_RESET_TO_UNICODE) {
   1170         /* toUnicode state: 8=no BOM handling, 0=Java "UnicodeLittle" requires LE BOM or no BOM */
   1171         cnv->mode=(UCNV_GET_VERSION(cnv)==0) ? 8 : 0;
   1172     }
   1173 
   1174     if(choice!=UCNV_RESET_TO_UNICODE) {
   1175         /* fromUnicode state: only "UnicodeLittle" (version 1)
   1176          * prepares to output the UTF-16LE BOM */
   1177         if(UCNV_GET_VERSION(cnv)==1) {
   1178             cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1179         }
   1180     }
   1181 }
   1182 
   /*
    * Open function for UTF-16LE: accepts versions 0 and 1 and puts the
    * converter into its reset state; any other version is an illegal argument.
    * pArgs is not used.
    */
   1183 static void
   1184 _UTF16LEOpen(UConverter *cnv, UConverterLoadArgs *pArgs,
   1185              UErrorCode *pErrorCode) {
   1186     if(UCNV_GET_VERSION(cnv)>1) {
   1187         /* no such variant of UTF-16LE */
   1188         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1189         return;
   1190     }
   1191     _UTF16LEReset(cnv, UCNV_RESET_BOTH);
   1192 }
   1193 
   /* Returns the converter name that matches the version of a UTF-16LE converter. */
   1194 static const char *
   1195 _UTF16LEGetName(const UConverter *cnv) {
   1196     /*
   1197      * version 0 is plain UTF-16LE;
   1198      * any other version reports the version=1 name
   1199      */
   1200     return (UCNV_GET_VERSION(cnv)==0) ? "UTF-16LE" : "UTF-16LE,version=1";
   1201 }
   1202 
   /*
    * Function table for the UTF-16LE converter.
    * The initializer is positional. The toUnicode function and the fromUnicode
    * function are each listed twice in a row - presumably one slot for the
    * plain and one for the "with offsets" entry point; confirm against the
    * UConverterImpl declaration. NULL slots are not provided by this converter.
    */
   1203 static const UConverterImpl _UTF16LEImpl={
   1204     UCNV_UTF16_LittleEndian,
   1205 
   1206     NULL,
   1207     NULL,
   1208 
   1209     _UTF16LEOpen,
   1210     NULL,
   1211     _UTF16LEReset,
   1212 
   1213     _UTF16LEToUnicodeWithOffsets,
   1214     _UTF16LEToUnicodeWithOffsets,
   1215     _UTF16LEFromUnicodeWithOffsets,
   1216     _UTF16LEFromUnicodeWithOffsets,
   1217     _UTF16LEGetNextUChar,
   1218 
   1219     NULL,
   1220     _UTF16LEGetName,
   1221     NULL,
   1222     NULL,
   1223     ucnv_getNonSurrogateUnicodeSet
   1224 };
   1225 
   1226 
   /*
    * Static description of the UTF-16LE converter (positional initializer).
    * NOTE(review): 1202 sits where the BOM-detecting UTF-16 data has its CCSID,
    * and { 0xfd, 0xff } with length 2 looks like the substitution character
    * U+FFFD in little-endian byte order - confirm against UConverterStaticData.
    */
   1227 static const UConverterStaticData _UTF16LEStaticData={
   1228     sizeof(UConverterStaticData),
   1229     "UTF-16LE",
   1230     1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2,
   1231     { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE,
   1232     0,
   1233     0,
   1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1235 };
   1236 
   1237 
   /*
    * Shared data object for UTF-16LE: refers to the static data and to the
    * function table above. IS_UTF16LE() recognizes the converter variant by
    * comparing cnv->sharedData with the address of this object.
    */
   1238 const UConverterSharedData _UTF16LEData={
   1239     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1240     NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl,
   1241     0
   1242 };
   1243 
   1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */
   1245 
   1246 /*
   1247  * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE
   1248  * accordingly.
   1249  * This is a simpler version of the UTF-32 converter, with
   1250  * fewer states for shorter BOMs.
   1251  *
   1252  * State values:
   1253  * 0    initial state
   1254  * 1    saw first byte
   1255  * 2..5 -
   1256  * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1
   1257  * 8    UTF-16BE mode
   1258  * 9    UTF-16LE mode
   1259  *
   1260  * During detection: state==number of initial bytes seen so far.
   1261  *
   1262  * On output, emit U+FEFF as the first code point.
   1263  *
   1264  * Variants:
   1265  * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error.
   1266  * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and
   1267  *   UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error.
   1268  */
   1269 
   /*
    * Resets the toUnicode and/or fromUnicode state of a BOM-detecting UTF-16
    * converter, as selected by choice.
    */
   1270 static void
   1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) {
   1272     if(choice!=UCNV_RESET_TO_UNICODE) {
   1273         /* fromUnicode: the next output starts with the UTF-16PE BOM */
   1274         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1275     }
   1276     if(choice<=UCNV_RESET_TO_UNICODE) {
   1277         /* toUnicode: back to state 0, the initial state */
   1278         cnv->mode=0;
   1279     }
   1280 }
   1281 
   1282 static const UConverterSharedData _UTF16v2Data; /* declared ahead of its initialized definition because _UTF16Open() refers to it */
   1283 
   /*
    * Open function for the BOM-detecting UTF-16 converter.
    * Accepts versions 0, 1 and 2; any other version is an illegal argument.
    * For version 2 (unless pArgs->onlyTestIsLoadable) the converter is
    * switched to the version-2 shared data before it is reset.
    */
   1284 static void
   1285 _UTF16Open(UConverter *cnv, UConverterLoadArgs *pArgs,
   1286            UErrorCode *pErrorCode) {
   1287     if(UCNV_GET_VERSION(cnv)>2) {
   1288         /* no such variant of UTF-16 */
   1289         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
   1290         return;
   1291     }
   1292     if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) {
   1293         /*
   1294          * Use the alternate shared data, whose fromUnicode() always writes
   1295          * a big-endian byte stream, and replace the substitution bytes
   1296          * that were copied into the UConverter from the default static data.
   1297          * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.)
   1298          */
   1299         const UConverterSharedData *v2=&_UTF16v2Data;
   1300         cnv->sharedData=(UConverterSharedData*)v2;
   1301         uprv_memcpy(cnv->subChars, v2->staticData->subChar, UCNV_MAX_SUBCHAR_LEN);
   1302     }
   1303     _UTF16Reset(cnv, UCNV_RESET_BOTH);
   1304 }
   1304 
   /* Returns the converter name that matches the version of a UTF-16 converter. */
   1305 static const char *
   1306 _UTF16GetName(const UConverter *cnv) {
   1307     /* version 0 is standard UTF-16;
   1308      * every version other than 0 and 1 reports the version=2 name */
   1309     switch(UCNV_GET_VERSION(cnv)) {
   1310     case 0: return "UTF-16";
   1311     case 1: return "UTF-16,version=1";
   1312     default: return "UTF-16,version=2";
   1313     }
   1314 }
   1315 
   /* declared here so that IS_UTF16() can refer to it; it is initialized further down in this file */
   1316 const UConverterSharedData _UTF16Data;
   1317 
        /*
         * Identify the converter variant by the shared data object that
         * cnv->sharedData points to; IS_UTF16() covers both the default and
         * the version-2 shared data.
         */
   1318 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData)
   1319 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData)
   1320 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data || (cnv)->sharedData==&_UTF16v2Data)
   1321 
   /*
    * toUnicode implementation with BOM handling.
    * Used directly by the BOM-detecting UTF-16 converter, and called by
    * _UTF16LEToUnicodeWithOffsets() while cnv->mode<8.
    *
    * Looks at the first two bytes of the stream, selects state 8 (UTF-16BE)
    * or 9 (UTF-16LE), and from then on passes the input to
    * _UTF16BEToUnicodeWithOffsets() or _UTF16LEToUnicodeWithOffsets().
    * The state is kept in cnv->mode between calls; the first byte is kept in
    * cnv->toUBytes[0] while the second one has not been seen yet.
    *
    * A missing or reverse BOM that the converter variant does not allow is
    * reported as U_ILLEGAL_ESCAPE_SEQUENCE with the two bytes in toUBytes[].
    */
   1322 static void
   1323 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1324                            UErrorCode *pErrorCode) {
   1325     UConverter *cnv=pArgs->converter;
   1326     const char *source=pArgs->source;
   1327     const char *sourceLimit=pArgs->sourceLimit;
   1328     int32_t *offsets=pArgs->offsets;
   1329 
   1330     int32_t state, offsetDelta;
   1331     uint8_t b;
   1332 
   1333     state=cnv->mode;
   1334 
   1335     /*
   1336      * If we detect a BOM in this buffer, then we must add the BOM size to the
   1337      * offsets because the actual converter function will not see and count the BOM.
   1338      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
   1339      */
   1340     offsetDelta=0;
   1341 
   1342     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
   1343         switch(state) {
   1344         case 0:
                    /* remember the first byte; the decision is made when the second byte is available */
   1345             cnv->toUBytes[0]=(uint8_t)*source++;
   1346             cnv->toULength=1;
   1347             state=1;
   1348             break;
   1349         case 1:
   1350             /*
   1351              * Only inside this switch case can the state variable
   1352              * temporarily take two additional values:
   1353              * 6: BOM error, continue with BE
   1354              * 7: BOM error, continue with LE
   1355              */
                    /* peek at the second byte without consuming it yet */
   1356             b=*source;
   1357             if(cnv->toUBytes[0]==0xfe && b==0xff) {
   1358                 if(IS_UTF16LE(cnv)) {
   1359                     state=7; /* illegal reverse BOM for Java "UnicodeLittle" */
   1360                 } else {
   1361                     state=8; /* detect UTF-16BE */
   1362                 }
   1363             } else if(cnv->toUBytes[0]==0xff && b==0xfe) {
   1364                 if(IS_UTF16BE(cnv)) {
   1365                     state=6; /* illegal reverse BOM for Java "UnicodeBig" */
   1366                 } else {
   1367                     state=9; /* detect UTF-16LE */
   1368                 }
   1369             } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) {
   1370                 state=6; /* illegal missing BOM for Java "Unicode" */
   1371             }
                    /* state is still 1 here when there is no BOM and none is required */
   1372             if(state>=8) {
   1373                 /* BOM detected, consume it */
   1374                 ++source;
   1375                 cnv->toULength=0;
   1376                 offsetDelta=(int32_t)(source-pArgs->source);
   1377             } else if(state<6) {
   1378                 /* ok: no BOM, and not a reverse BOM */
   1379                 if(source!=pArgs->source) {
   1380                     /* reset the source for a correct first offset */
   1381                     source=pArgs->source;
   1382                     cnv->toULength=0;
   1383                 }
                        /*
                         * otherwise the first byte came in with an earlier call and
                         * stays in toUBytes[] (toULength==1) for the selected converter
                         */
   1384                 if(IS_UTF16LE(cnv)) {
   1385                     /* Make Java "UnicodeLittle" default to LE. */
   1386                     state=9;
   1387                 } else {
   1388                     /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */
   1389                     state=8;
   1390                 }
   1391             } else {
   1392                 /*
   1393                  * error: missing BOM, or reverse BOM
   1394                  * UTF-16,version=1: Java-specific "Unicode" requires a BOM.
   1395                  * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM.
   1396                  * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM.
   1397                  */
   1398                 /* report the non-BOM or reverse BOM as an illegal sequence */
   1399                 cnv->toUBytes[1]=b;
   1400                 cnv->toULength=2;
   1401                 pArgs->source=source+1;
   1402                 /* continue with conversion if the callback resets the error */
   1403                 /*
   1404                  * Make Java "Unicode" default to BE like standard UTF-16.
   1405                  * Make Java "UnicodeBig" and "UnicodeLittle" default
   1406                  * to their normal endiannesses.
   1407                  */
                        /* state+2 maps the temporary values 6 and 7 to 8 (BE) and 9 (LE) */
   1408                 cnv->mode=state+2;
   1409                 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE;
   1410                 return;
   1411             }
   1412             /* convert the rest of the stream */
                    /* store the state first: _UTF16LEToUnicodeWithOffsets() checks cnv->mode when it is called */
   1413             cnv->mode=state;
   1414             continue;
   1415         case 8:
   1416             /* call UTF-16BE */
   1417             pArgs->source=source;
   1418             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
   1419             source=pArgs->source;
   1420             break;
   1421         case 9:
   1422             /* call UTF-16LE */
   1423             pArgs->source=source;
   1424             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
   1425             source=pArgs->source;
   1426             break;
   1427         default:
   1428             break; /* does not occur */
   1429         }
   1430     }
   1431 
   1432     /* add BOM size to offsets - see comment at offsetDelta declaration */
            /*
             * the local offsets pointer still has the value that pArgs->offsets had
             * on entry; pArgs->offsets is compared as the end of the offsets written
             */
   1433     if(offsets!=NULL && offsetDelta!=0) {
   1434         int32_t *offsetsLimit=pArgs->offsets;
   1435         while(offsets<offsetsLimit) {
   1436             *offsets++ += offsetDelta;
   1437         }
   1438     }
   1439 
   1440     pArgs->source=source;
   1441 
   1442     if(source==sourceLimit && pArgs->flush) {
   1443         /* handle truncated input */
                /* the selected converter is called once more with no input left */
   1444         switch(state) {
   1445         case 0:
   1446             break; /* no input at all, nothing to do */
   1447         case 8:
   1448             _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode);
   1449             break;
   1450         case 9:
   1451             _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode);
   1452             break;
   1453         default:
   1454             /* 0<state<8: framework will report truncation, nothing to do here */
   1455             break;
   1456         }
   1457     }
   1458 
   1459     cnv->mode=state;
   1460 }
   1461 
   /*
    * getNextUChar for the BOM-detecting UTF-16 converter: dispatches to the
    * UTF-16BE or UTF-16LE function once cnv->mode is 8 or 9.
    */
   1462 static UChar32
   1463 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *pErrorCode) {
   1464     const int32_t mode=pArgs->converter->mode;
   1465     if(mode==8) {
   1466         return _UTF16BEGetNextUChar(pArgs, pErrorCode);
   1467     }
   1468     if(mode==9) {
   1469         return _UTF16LEGetNextUChar(pArgs, pErrorCode);
   1470     }
   1471     /* any other mode: no byte order has been selected yet */
   1472     return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1473 }
   1474 
   /*
    * Function table for the BOM-detecting UTF-16 converter (versions 0 and 1).
    * The initializer is positional; see the UConverterImpl declaration for the
    * meaning of each slot. fromUnicode uses _UTF16PEFromUnicodeWithOffsets,
    * which is defined near the top of this file as the BE or the LE function
    * depending on U_IS_BIG_ENDIAN.
    */
   1475 static const UConverterImpl _UTF16Impl = {
   1476     UCNV_UTF16,
   1477 
   1478     NULL,
   1479     NULL,
   1480 
   1481     _UTF16Open,
   1482     NULL,
   1483     _UTF16Reset,
   1484 
   1485     _UTF16ToUnicodeWithOffsets,
   1486     _UTF16ToUnicodeWithOffsets,
   1487     _UTF16PEFromUnicodeWithOffsets,
   1488     _UTF16PEFromUnicodeWithOffsets,
   1489     _UTF16GetNextUChar,
   1490 
   1491     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
   1492     _UTF16GetName,
   1493     NULL,
   1494     NULL,
   1495     ucnv_getNonSurrogateUnicodeSet
   1496 };
   1497 
   /*
    * Static description of the BOM-detecting UTF-16 converter (positional
    * initializer). The two-byte array depends on U_IS_BIG_ENDIAN.
    * NOTE(review): the bytes look like the substitution character U+FFFD in
    * platform byte order - confirm against UConverterStaticData.
    */
   1498 static const UConverterStaticData _UTF16StaticData = {
   1499     sizeof(UConverterStaticData),
   1500     "UTF-16",
   1501     1204, /* CCSID for BOM sensitive UTF-16 */
   1502     UCNV_IBM, UCNV_UTF16, 2, 2,
   1503 #if U_IS_BIG_ENDIAN
   1504     { 0xff, 0xfd, 0, 0 }, 2,
   1505 #else
   1506     { 0xfd, 0xff, 0, 0 }, 2,
   1507 #endif
   1508     FALSE, FALSE,
   1509     0,
   1510     0,
   1511     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1512 };
   1513 
   /*
    * Shared data object for the BOM-detecting UTF-16 converter: refers to its
    * static data and function table. IS_UTF16() compares cnv->sharedData with
    * the address of this object.
    */
   1514 const UConverterSharedData _UTF16Data = {
   1515     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1516     NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl,
   1517     0
   1518 };
   1519 
   /*
    * Function table for UTF-16,version=2 (positional initializer).
    * Same entries as _UTF16Impl except that fromUnicode always uses
    * _UTF16BEFromUnicodeWithOffsets, independent of the platform byte order.
    */
   1520 static const UConverterImpl _UTF16v2Impl = {
   1521     UCNV_UTF16,
   1522 
   1523     NULL,
   1524     NULL,
   1525 
   1526     _UTF16Open,
   1527     NULL,
   1528     _UTF16Reset,
   1529 
   1530     _UTF16ToUnicodeWithOffsets,
   1531     _UTF16ToUnicodeWithOffsets,
   1532     _UTF16BEFromUnicodeWithOffsets,
   1533     _UTF16BEFromUnicodeWithOffsets,
   1534     _UTF16GetNextUChar,
   1535 
   1536     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
   1537     _UTF16GetName,
   1538     NULL,
   1539     NULL,
   1540     ucnv_getNonSurrogateUnicodeSet
   1541 };
   1542 
   /*
    * Static description of UTF-16,version=2 (positional initializer).
    * Unlike _UTF16StaticData, the two-byte array is { 0xff, 0xfd } regardless
    * of U_IS_BIG_ENDIAN; _UTF16Open() copies staticData->subChar from here
    * into cnv->subChars for version 2.
    */
   1543 static const UConverterStaticData _UTF16v2StaticData = {
   1544     sizeof(UConverterStaticData),
   1545     "UTF-16,version=2",
   1546     1204, /* CCSID for BOM sensitive UTF-16 */
   1547     UCNV_IBM, UCNV_UTF16, 2, 2,
   1548     { 0xff, 0xfd, 0, 0 }, 2,
   1549     FALSE, FALSE,
   1550     0,
   1551     0,
   1552     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1553 };
   1554 
   /*
    * Shared data object for UTF-16,version=2; _UTF16Open() installs it in
    * cnv->sharedData for version 2, and IS_UTF16() accepts its address.
    */
   1555 static const UConverterSharedData _UTF16v2Data = {
   1556     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1557     NULL, NULL, &_UTF16v2StaticData, FALSE, &_UTF16v2Impl,
   1558     0
   1559 };
   1560 
   1561 #endif
   1562