Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2015, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv_u32.c
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2002jul01
     12 *   created by: Markus W. Scherer
     13 *
     14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     20 
     21 #include "unicode/ucnv.h"
     22 #include "unicode/utf.h"
     23 #include "ucnv_bld.h"
     24 #include "ucnv_cnv.h"
     25 #include "cmemory.h"
     26 
/*
 * Numeric constants shared by the UTF-32 converters in this file.
 * MAXIMUM_UCS2 is the largest value written as a single 16-bit unit and
 * MAXIMUM_UTF the largest code point accepted; the remaining values are
 * used when combining a surrogate pair into one code point.
 */
#define MAXIMUM_UCS2            0x0000FFFF
#define MAXIMUM_UTF             0x0010FFFF
#define HALF_SHIFT              10
#define HALF_BASE               0x0010000
#define HALF_MASK               0x3FF
#define SURROGATE_HIGH_START    0xD800
#define SURROGATE_LOW_START     0xDC00

/* -SURROGATE_LOW_START + HALF_BASE  (0x10000 - 0xDC00 = 0x2400 = 9216) */
#define SURROGATE_LOW_BASE      9216

/* Value of converter->fromUnicodeStatus that asks fromUnicode to emit a BOM first. */
enum {
    UCNV_NEED_TO_WRITE_BOM=1
};
     41 
     42 /* UTF-32BE ----------------------------------------------------------------- */
     43 
     44 static void
     45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
     46                                 UErrorCode * err)
     47 {
     48     const unsigned char *mySource = (unsigned char *) args->source;
     49     UChar *myTarget = args->target;
     50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     51     const UChar *targetLimit = args->targetLimit;
     52     unsigned char *toUBytes = args->converter->toUBytes;
     53     uint32_t ch, i;
     54 
     55     /* Restore state of current sequence */
     56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
     57         i = args->converter->toULength;       /* restore # of bytes consumed */
     58         args->converter->toULength = 0;
     59 
     60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
     61         args->converter->toUnicodeStatus = 0;
     62         goto morebytes;
     63     }
     64 
     65     while (mySource < sourceLimit && myTarget < targetLimit) {
     66         i = 0;
     67         ch = 0;
     68 morebytes:
     69         while (i < sizeof(uint32_t)) {
     70             if (mySource < sourceLimit) {
     71                 ch = (ch << 8) | (uint8_t)(*mySource);
     72                 toUBytes[i++] = (char) *(mySource++);
     73             }
     74             else {
     75                 /* stores a partially calculated target*/
     76                 /* + 1 to make 0 a valid character */
     77                 args->converter->toUnicodeStatus = ch + 1;
     78                 args->converter->toULength = (int8_t) i;
     79                 goto donefornow;
     80             }
     81         }
     82 
     83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
     84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
     85             if (ch <= MAXIMUM_UCS2)
     86             {
     87                 /* fits in 16 bits */
     88                 *(myTarget++) = (UChar) ch;
     89             }
     90             else {
     91                 /* write out the surrogates */
     92                 *(myTarget++) = U16_LEAD(ch);
     93                 ch = U16_TRAIL(ch);
     94                 if (myTarget < targetLimit) {
     95                     *(myTarget++) = (UChar)ch;
     96                 }
     97                 else {
     98                     /* Put in overflow buffer (not handled here) */
     99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    100                     args->converter->UCharErrorBufferLength = 1;
    101                     *err = U_BUFFER_OVERFLOW_ERROR;
    102                     break;
    103                 }
    104             }
    105         }
    106         else {
    107             args->converter->toULength = (int8_t)i;
    108             *err = U_ILLEGAL_CHAR_FOUND;
    109             break;
    110         }
    111     }
    112 
    113 donefornow:
    114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    115         /* End of target buffer */
    116         *err = U_BUFFER_OVERFLOW_ERROR;
    117     }
    118 
    119     args->target = myTarget;
    120     args->source = (const char *) mySource;
    121 }
    122 
    123 static void
    124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    125                                              UErrorCode * err)
    126 {
    127     const unsigned char *mySource = (unsigned char *) args->source;
    128     UChar *myTarget = args->target;
    129     int32_t *myOffsets = args->offsets;
    130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    131     const UChar *targetLimit = args->targetLimit;
    132     unsigned char *toUBytes = args->converter->toUBytes;
    133     uint32_t ch, i;
    134     int32_t offsetNum = 0;
    135 
    136     /* Restore state of current sequence */
    137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
    138         i = args->converter->toULength;       /* restore # of bytes consumed */
    139         args->converter->toULength = 0;
    140 
    141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
    142         args->converter->toUnicodeStatus = 0;
    143         goto morebytes;
    144     }
    145 
    146     while (mySource < sourceLimit && myTarget < targetLimit) {
    147         i = 0;
    148         ch = 0;
    149 morebytes:
    150         while (i < sizeof(uint32_t)) {
    151             if (mySource < sourceLimit) {
    152                 ch = (ch << 8) | (uint8_t)(*mySource);
    153                 toUBytes[i++] = (char) *(mySource++);
    154             }
    155             else {
    156                 /* stores a partially calculated target*/
    157                 /* + 1 to make 0 a valid character */
    158                 args->converter->toUnicodeStatus = ch + 1;
    159                 args->converter->toULength = (int8_t) i;
    160                 goto donefornow;
    161             }
    162         }
    163 
    164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    166             if (ch <= MAXIMUM_UCS2) {
    167                 /* fits in 16 bits */
    168                 *(myTarget++) = (UChar) ch;
    169                 *(myOffsets++) = offsetNum;
    170             }
    171             else {
    172                 /* write out the surrogates */
    173                 *(myTarget++) = U16_LEAD(ch);
    174                 *myOffsets++ = offsetNum;
    175                 ch = U16_TRAIL(ch);
    176                 if (myTarget < targetLimit)
    177                 {
    178                     *(myTarget++) = (UChar)ch;
    179                     *(myOffsets++) = offsetNum;
    180                 }
    181                 else {
    182                     /* Put in overflow buffer (not handled here) */
    183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    184                     args->converter->UCharErrorBufferLength = 1;
    185                     *err = U_BUFFER_OVERFLOW_ERROR;
    186                     break;
    187                 }
    188             }
    189         }
    190         else {
    191             args->converter->toULength = (int8_t)i;
    192             *err = U_ILLEGAL_CHAR_FOUND;
    193             break;
    194         }
    195         offsetNum += i;
    196     }
    197 
    198 donefornow:
    199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    200     {
    201         /* End of target buffer */
    202         *err = U_BUFFER_OVERFLOW_ERROR;
    203     }
    204 
    205     args->target = myTarget;
    206     args->source = (const char *) mySource;
    207     args->offsets = myOffsets;
    208 }
    209 
    210 static void
    211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
    212                                   UErrorCode * err)
    213 {
    214     const UChar *mySource = args->source;
    215     unsigned char *myTarget;
    216     const UChar *sourceLimit = args->sourceLimit;
    217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    218     UChar32 ch, ch2;
    219     unsigned int indexToWrite;
    220     unsigned char temp[sizeof(uint32_t)];
    221 
    222     if(mySource >= sourceLimit) {
    223         /* no input, nothing to do */
    224         return;
    225     }
    226 
    227     /* write the BOM if necessary */
    228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    230         ucnv_fromUWriteBytes(args->converter,
    231                              bom, 4,
    232                              &args->target, args->targetLimit,
    233                              &args->offsets, -1,
    234                              err);
    235         args->converter->fromUnicodeStatus=0;
    236     }
    237 
    238     myTarget = (unsigned char *) args->target;
    239     temp[0] = 0;
    240 
    241     if (args->converter->fromUChar32) {
    242         ch = args->converter->fromUChar32;
    243         args->converter->fromUChar32 = 0;
    244         goto lowsurogate;
    245     }
    246 
    247     while (mySource < sourceLimit && myTarget < targetLimit) {
    248         ch = *(mySource++);
    249 
    250         if (U_IS_SURROGATE(ch)) {
    251             if (U_IS_LEAD(ch)) {
    252 lowsurogate:
    253                 if (mySource < sourceLimit) {
    254                     ch2 = *mySource;
    255                     if (U_IS_TRAIL(ch2)) {
    256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    257                         mySource++;
    258                     }
    259                     else {
    260                         /* this is an unmatched trail code unit (2nd surrogate) */
    261                         /* callback(illegal) */
    262                         args->converter->fromUChar32 = ch;
    263                         *err = U_ILLEGAL_CHAR_FOUND;
    264                         break;
    265                     }
    266                 }
    267                 else {
    268                     /* ran out of source */
    269                     args->converter->fromUChar32 = ch;
    270                     if (args->flush) {
    271                         /* this is an unmatched trail code unit (2nd surrogate) */
    272                         /* callback(illegal) */
    273                         *err = U_ILLEGAL_CHAR_FOUND;
    274                     }
    275                     break;
    276                 }
    277             }
    278             else {
    279                 /* this is an unmatched trail code unit (2nd surrogate) */
    280                 /* callback(illegal) */
    281                 args->converter->fromUChar32 = ch;
    282                 *err = U_ILLEGAL_CHAR_FOUND;
    283                 break;
    284             }
    285         }
    286 
    287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    291 
    292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    293             if (myTarget < targetLimit) {
    294                 *(myTarget++) = temp[indexToWrite];
    295             }
    296             else {
    297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    298                 *err = U_BUFFER_OVERFLOW_ERROR;
    299             }
    300         }
    301     }
    302 
    303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    304         *err = U_BUFFER_OVERFLOW_ERROR;
    305     }
    306 
    307     args->target = (char *) myTarget;
    308     args->source = mySource;
    309 }
    310 
    311 static void
    312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    313                                                UErrorCode * err)
    314 {
    315     const UChar *mySource = args->source;
    316     unsigned char *myTarget;
    317     int32_t *myOffsets;
    318     const UChar *sourceLimit = args->sourceLimit;
    319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    320     UChar32 ch, ch2;
    321     int32_t offsetNum = 0;
    322     unsigned int indexToWrite;
    323     unsigned char temp[sizeof(uint32_t)];
    324 
    325     if(mySource >= sourceLimit) {
    326         /* no input, nothing to do */
    327         return;
    328     }
    329 
    330     /* write the BOM if necessary */
    331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    333         ucnv_fromUWriteBytes(args->converter,
    334                              bom, 4,
    335                              &args->target, args->targetLimit,
    336                              &args->offsets, -1,
    337                              err);
    338         args->converter->fromUnicodeStatus=0;
    339     }
    340 
    341     myTarget = (unsigned char *) args->target;
    342     myOffsets = args->offsets;
    343     temp[0] = 0;
    344 
    345     if (args->converter->fromUChar32) {
    346         ch = args->converter->fromUChar32;
    347         args->converter->fromUChar32 = 0;
    348         goto lowsurogate;
    349     }
    350 
    351     while (mySource < sourceLimit && myTarget < targetLimit) {
    352         ch = *(mySource++);
    353 
    354         if (U_IS_SURROGATE(ch)) {
    355             if (U_IS_LEAD(ch)) {
    356 lowsurogate:
    357                 if (mySource < sourceLimit) {
    358                     ch2 = *mySource;
    359                     if (U_IS_TRAIL(ch2)) {
    360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    361                         mySource++;
    362                     }
    363                     else {
    364                         /* this is an unmatched trail code unit (2nd surrogate) */
    365                         /* callback(illegal) */
    366                         args->converter->fromUChar32 = ch;
    367                         *err = U_ILLEGAL_CHAR_FOUND;
    368                         break;
    369                     }
    370                 }
    371                 else {
    372                     /* ran out of source */
    373                     args->converter->fromUChar32 = ch;
    374                     if (args->flush) {
    375                         /* this is an unmatched trail code unit (2nd surrogate) */
    376                         /* callback(illegal) */
    377                         *err = U_ILLEGAL_CHAR_FOUND;
    378                     }
    379                     break;
    380                 }
    381             }
    382             else {
    383                 /* this is an unmatched trail code unit (2nd surrogate) */
    384                 /* callback(illegal) */
    385                 args->converter->fromUChar32 = ch;
    386                 *err = U_ILLEGAL_CHAR_FOUND;
    387                 break;
    388             }
    389         }
    390 
    391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    395 
    396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    397             if (myTarget < targetLimit) {
    398                 *(myTarget++) = temp[indexToWrite];
    399                 *(myOffsets++) = offsetNum;
    400             }
    401             else {
    402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    403                 *err = U_BUFFER_OVERFLOW_ERROR;
    404             }
    405         }
    406         offsetNum = offsetNum + 1 + (temp[1] != 0);
    407     }
    408 
    409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    410         *err = U_BUFFER_OVERFLOW_ERROR;
    411     }
    412 
    413     args->target = (char *) myTarget;
    414     args->source = mySource;
    415     args->offsets = myOffsets;
    416 }
    417 
    418 static UChar32
    419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
    420                                    UErrorCode* err)
    421 {
    422     const uint8_t *mySource;
    423     UChar32 myUChar;
    424     int32_t length;
    425 
    426     mySource = (const uint8_t *)args->source;
    427     if (mySource >= (const uint8_t *)args->sourceLimit)
    428     {
    429         /* no input */
    430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    431         return 0xffff;
    432     }
    433 
    434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    435     if (length < 4)
    436     {
    437         /* got a partial character */
    438         uprv_memcpy(args->converter->toUBytes, mySource, length);
    439         args->converter->toULength = (int8_t)length;
    440         args->source = (const char *)(mySource + length);
    441         *err = U_TRUNCATED_CHAR_FOUND;
    442         return 0xffff;
    443     }
    444 
    445     /* Don't even try to do a direct cast because the value may be on an odd address. */
    446     myUChar = ((UChar32)mySource[0] << 24)
    447             | ((UChar32)mySource[1] << 16)
    448             | ((UChar32)mySource[2] << 8)
    449             | ((UChar32)mySource[3]);
    450 
    451     args->source = (const char *)(mySource + 4);
    452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    453         return myUChar;
    454     }
    455 
    456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    457     args->converter->toULength = 4;
    458 
    459     *err = U_ILLEGAL_CHAR_FOUND;
    460     return 0xffff;
    461 }
    462 
/*
 * Implementation table for the UTF-32BE converter: the converter type followed
 * by its function slots. Only the five conversion entry points defined in this
 * file and ucnv_getNonSurrogateUnicodeSet are supplied; every other slot is NULL.
 * NOTE(review): the slot order is dictated by the UConverterImpl declaration,
 * which is not visible here -- confirm against it before adding or moving entries.
 */
static const UConverterImpl _UTF32BEImpl = {
    UCNV_UTF32_BigEndian,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* bytes -> UTF-16, without and with offsets */
    T_UConverter_toUnicode_UTF32_BE,
    T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
    /* UTF-16 -> bytes, without and with offsets */
    T_UConverter_fromUnicode_UTF32_BE,
    T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
    /* single code point reader */
    T_UConverter_getNextUChar_UTF32_BE,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getNonSurrogateUnicodeSet
};
    485 
/* The 1232 CCSID refers to any version of Unicode with any endianness of UTF-32 */
/*
 * Static description of the UTF-32BE converter.
 * NOTE(review): the per-field notes below are inferred from the values; the
 * UConverterStaticData declaration is not visible here -- confirm against it.
 */
static const UConverterStaticData _UTF32BEStaticData = {
    sizeof(UConverterStaticData),
    "UTF-32BE",
    1232,
    UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, /* presumably platform, type, min/max bytes per character -- TODO confirm */
    { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, /* bytes 00 00 FF FD are 0xFFFD in big-endian order; presumably the substitution sequence and its length -- TODO confirm */
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
    497 
/*
 * Non-static shared-data object for UTF-32BE, built from the static data and
 * implementation table defined above in this file.
 */
const UConverterSharedData _UTF32BEData =
        UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl);
    500 
    501 /* UTF-32LE ---------------------------------------------------------- */
    502 
    503 static void
    504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
    505                                 UErrorCode * err)
    506 {
    507     const unsigned char *mySource = (unsigned char *) args->source;
    508     UChar *myTarget = args->target;
    509     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    510     const UChar *targetLimit = args->targetLimit;
    511     unsigned char *toUBytes = args->converter->toUBytes;
    512     uint32_t ch, i;
    513 
    514     /* Restore state of current sequence */
    515     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    516     {
    517         i = args->converter->toULength;       /* restore # of bytes consumed */
    518         args->converter->toULength = 0;
    519 
    520         /* Stores the previously calculated ch from a previous call*/
    521         ch = args->converter->toUnicodeStatus - 1;
    522         args->converter->toUnicodeStatus = 0;
    523         goto morebytes;
    524     }
    525 
    526     while (mySource < sourceLimit && myTarget < targetLimit)
    527     {
    528         i = 0;
    529         ch = 0;
    530 morebytes:
    531         while (i < sizeof(uint32_t))
    532         {
    533             if (mySource < sourceLimit)
    534             {
    535                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    536                 toUBytes[i++] = (char) *(mySource++);
    537             }
    538             else
    539             {
    540                 /* stores a partially calculated target*/
    541                 /* + 1 to make 0 a valid character */
    542                 args->converter->toUnicodeStatus = ch + 1;
    543                 args->converter->toULength = (int8_t) i;
    544                 goto donefornow;
    545             }
    546         }
    547 
    548         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    549             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    550             if (ch <= MAXIMUM_UCS2) {
    551                 /* fits in 16 bits */
    552                 *(myTarget++) = (UChar) ch;
    553             }
    554             else {
    555                 /* write out the surrogates */
    556                 *(myTarget++) = U16_LEAD(ch);
    557                 ch = U16_TRAIL(ch);
    558                 if (myTarget < targetLimit) {
    559                     *(myTarget++) = (UChar)ch;
    560                 }
    561                 else {
    562                     /* Put in overflow buffer (not handled here) */
    563                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    564                     args->converter->UCharErrorBufferLength = 1;
    565                     *err = U_BUFFER_OVERFLOW_ERROR;
    566                     break;
    567                 }
    568             }
    569         }
    570         else {
    571             args->converter->toULength = (int8_t)i;
    572             *err = U_ILLEGAL_CHAR_FOUND;
    573             break;
    574         }
    575     }
    576 
    577 donefornow:
    578     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    579     {
    580         /* End of target buffer */
    581         *err = U_BUFFER_OVERFLOW_ERROR;
    582     }
    583 
    584     args->target = myTarget;
    585     args->source = (const char *) mySource;
    586 }
    587 
    588 static void
    589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    590                                              UErrorCode * err)
    591 {
    592     const unsigned char *mySource = (unsigned char *) args->source;
    593     UChar *myTarget = args->target;
    594     int32_t *myOffsets = args->offsets;
    595     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    596     const UChar *targetLimit = args->targetLimit;
    597     unsigned char *toUBytes = args->converter->toUBytes;
    598     uint32_t ch, i;
    599     int32_t offsetNum = 0;
    600 
    601     /* Restore state of current sequence */
    602     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    603     {
    604         i = args->converter->toULength;       /* restore # of bytes consumed */
    605         args->converter->toULength = 0;
    606 
    607         /* Stores the previously calculated ch from a previous call*/
    608         ch = args->converter->toUnicodeStatus - 1;
    609         args->converter->toUnicodeStatus = 0;
    610         goto morebytes;
    611     }
    612 
    613     while (mySource < sourceLimit && myTarget < targetLimit)
    614     {
    615         i = 0;
    616         ch = 0;
    617 morebytes:
    618         while (i < sizeof(uint32_t))
    619         {
    620             if (mySource < sourceLimit)
    621             {
    622                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    623                 toUBytes[i++] = (char) *(mySource++);
    624             }
    625             else
    626             {
    627                 /* stores a partially calculated target*/
    628                 /* + 1 to make 0 a valid character */
    629                 args->converter->toUnicodeStatus = ch + 1;
    630                 args->converter->toULength = (int8_t) i;
    631                 goto donefornow;
    632             }
    633         }
    634 
    635         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
    636         {
    637             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    638             if (ch <= MAXIMUM_UCS2)
    639             {
    640                 /* fits in 16 bits */
    641                 *(myTarget++) = (UChar) ch;
    642                 *(myOffsets++) = offsetNum;
    643             }
    644             else {
    645                 /* write out the surrogates */
    646                 *(myTarget++) = U16_LEAD(ch);
    647                 *(myOffsets++) = offsetNum;
    648                 ch = U16_TRAIL(ch);
    649                 if (myTarget < targetLimit)
    650                 {
    651                     *(myTarget++) = (UChar)ch;
    652                     *(myOffsets++) = offsetNum;
    653                 }
    654                 else
    655                 {
    656                     /* Put in overflow buffer (not handled here) */
    657                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    658                     args->converter->UCharErrorBufferLength = 1;
    659                     *err = U_BUFFER_OVERFLOW_ERROR;
    660                     break;
    661                 }
    662             }
    663         }
    664         else
    665         {
    666             args->converter->toULength = (int8_t)i;
    667             *err = U_ILLEGAL_CHAR_FOUND;
    668             break;
    669         }
    670         offsetNum += i;
    671     }
    672 
    673 donefornow:
    674     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    675     {
    676         /* End of target buffer */
    677         *err = U_BUFFER_OVERFLOW_ERROR;
    678     }
    679 
    680     args->target = myTarget;
    681     args->source = (const char *) mySource;
    682     args->offsets = myOffsets;
    683 }
    684 
    685 static void
    686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
    687                                   UErrorCode * err)
    688 {
    689     const UChar *mySource = args->source;
    690     unsigned char *myTarget;
    691     const UChar *sourceLimit = args->sourceLimit;
    692     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    693     UChar32 ch, ch2;
    694     unsigned int indexToWrite;
    695     unsigned char temp[sizeof(uint32_t)];
    696 
    697     if(mySource >= sourceLimit) {
    698         /* no input, nothing to do */
    699         return;
    700     }
    701 
    702     /* write the BOM if necessary */
    703     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    704         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
    705         ucnv_fromUWriteBytes(args->converter,
    706                              bom, 4,
    707                              &args->target, args->targetLimit,
    708                              &args->offsets, -1,
    709                              err);
    710         args->converter->fromUnicodeStatus=0;
    711     }
    712 
    713     myTarget = (unsigned char *) args->target;
    714     temp[3] = 0;
    715 
    716     if (args->converter->fromUChar32)
    717     {
    718         ch = args->converter->fromUChar32;
    719         args->converter->fromUChar32 = 0;
    720         goto lowsurogate;
    721     }
    722 
    723     while (mySource < sourceLimit && myTarget < targetLimit)
    724     {
    725         ch = *(mySource++);
    726 
    727         if (U16_IS_SURROGATE(ch)) {
    728             if (U16_IS_LEAD(ch))
    729             {
    730 lowsurogate:
    731                 if (mySource < sourceLimit)
    732                 {
    733                     ch2 = *mySource;
    734                     if (U16_IS_TRAIL(ch2)) {
    735                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    736                         mySource++;
    737                     }
    738                     else {
    739                         /* this is an unmatched trail code unit (2nd surrogate) */
    740                         /* callback(illegal) */
    741                         args->converter->fromUChar32 = ch;
    742                         *err = U_ILLEGAL_CHAR_FOUND;
    743                         break;
    744                     }
    745                 }
    746                 else {
    747                     /* ran out of source */
    748                     args->converter->fromUChar32 = ch;
    749                     if (args->flush) {
    750                         /* this is an unmatched trail code unit (2nd surrogate) */
    751                         /* callback(illegal) */
    752                         *err = U_ILLEGAL_CHAR_FOUND;
    753                     }
    754                     break;
    755                 }
    756             }
    757             else {
    758                 /* this is an unmatched trail code unit (2nd surrogate) */
    759                 /* callback(illegal) */
    760                 args->converter->fromUChar32 = ch;
    761                 *err = U_ILLEGAL_CHAR_FOUND;
    762                 break;
    763             }
    764         }
    765 
    766         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    767         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    768         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    769         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    770 
    771         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    772         {
    773             if (myTarget < targetLimit)
    774             {
    775                 *(myTarget++) = temp[indexToWrite];
    776             }
    777             else
    778             {
    779                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    780                 *err = U_BUFFER_OVERFLOW_ERROR;
    781             }
    782         }
    783     }
    784 
    785     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    786     {
    787         *err = U_BUFFER_OVERFLOW_ERROR;
    788     }
    789 
    790     args->target = (char *) myTarget;
    791     args->source = mySource;
    792 }
    793 
    794 static void
    795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    796                                                UErrorCode * err)
    797 {
    798     const UChar *mySource = args->source;
    799     unsigned char *myTarget;
    800     int32_t *myOffsets;
    801     const UChar *sourceLimit = args->sourceLimit;
    802     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    803     UChar32 ch, ch2;
    804     unsigned int indexToWrite;
    805     unsigned char temp[sizeof(uint32_t)];
    806     int32_t offsetNum = 0;
    807 
    808     if(mySource >= sourceLimit) {
    809         /* no input, nothing to do */
    810         return;
    811     }
    812 
    813     /* write the BOM if necessary */
    814     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    815         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
    816         ucnv_fromUWriteBytes(args->converter,
    817                              bom, 4,
    818                              &args->target, args->targetLimit,
    819                              &args->offsets, -1,
    820                              err);
    821         args->converter->fromUnicodeStatus=0;
    822     }
    823 
    824     myTarget = (unsigned char *) args->target;
    825     myOffsets = args->offsets;
    826     temp[3] = 0;
    827 
    828     if (args->converter->fromUChar32)
    829     {
    830         ch = args->converter->fromUChar32;
    831         args->converter->fromUChar32 = 0;
    832         goto lowsurogate;
    833     }
    834 
    835     while (mySource < sourceLimit && myTarget < targetLimit)
    836     {
    837         ch = *(mySource++);
    838 
    839         if (U16_IS_SURROGATE(ch)) {
    840             if (U16_IS_LEAD(ch))
    841             {
    842 lowsurogate:
    843                 if (mySource < sourceLimit)
    844                 {
    845                     ch2 = *mySource;
    846                     if (U16_IS_TRAIL(ch2))
    847                     {
    848                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    849                         mySource++;
    850                     }
    851                     else {
    852                         /* this is an unmatched trail code unit (2nd surrogate) */
    853                         /* callback(illegal) */
    854                         args->converter->fromUChar32 = ch;
    855                         *err = U_ILLEGAL_CHAR_FOUND;
    856                         break;
    857                     }
    858                 }
    859                 else {
    860                     /* ran out of source */
    861                     args->converter->fromUChar32 = ch;
    862                     if (args->flush) {
    863                         /* this is an unmatched trail code unit (2nd surrogate) */
    864                         /* callback(illegal) */
    865                         *err = U_ILLEGAL_CHAR_FOUND;
    866                     }
    867                     break;
    868                 }
    869             }
    870             else {
    871                 /* this is an unmatched trail code unit (2nd surrogate) */
    872                 /* callback(illegal) */
    873                 args->converter->fromUChar32 = ch;
    874                 *err = U_ILLEGAL_CHAR_FOUND;
    875                 break;
    876             }
    877         }
    878 
    879         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    880         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    881         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    882         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    883 
    884         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    885         {
    886             if (myTarget < targetLimit)
    887             {
    888                 *(myTarget++) = temp[indexToWrite];
    889                 *(myOffsets++) = offsetNum;
    890             }
    891             else
    892             {
    893                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    894                 *err = U_BUFFER_OVERFLOW_ERROR;
    895             }
    896         }
    897         offsetNum = offsetNum + 1 + (temp[2] != 0);
    898     }
    899 
    900     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    901     {
    902         *err = U_BUFFER_OVERFLOW_ERROR;
    903     }
    904 
    905     args->target = (char *) myTarget;
    906     args->source = mySource;
    907     args->offsets = myOffsets;
    908 }
    909 
    910 static UChar32
    911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
    912                                    UErrorCode* err)
    913 {
    914     const uint8_t *mySource;
    915     UChar32 myUChar;
    916     int32_t length;
    917 
    918     mySource = (const uint8_t *)args->source;
    919     if (mySource >= (const uint8_t *)args->sourceLimit)
    920     {
    921         /* no input */
    922         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    923         return 0xffff;
    924     }
    925 
    926     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    927     if (length < 4)
    928     {
    929         /* got a partial character */
    930         uprv_memcpy(args->converter->toUBytes, mySource, length);
    931         args->converter->toULength = (int8_t)length;
    932         args->source = (const char *)(mySource + length);
    933         *err = U_TRUNCATED_CHAR_FOUND;
    934         return 0xffff;
    935     }
    936 
    937     /* Don't even try to do a direct cast because the value may be on an odd address. */
    938     myUChar = ((UChar32)mySource[3] << 24)
    939             | ((UChar32)mySource[2] << 16)
    940             | ((UChar32)mySource[1] << 8)
    941             | ((UChar32)mySource[0]);
    942 
    943     args->source = (const char *)(mySource + 4);
    944     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    945         return myUChar;
    946     }
    947 
    948     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    949     args->converter->toULength = 4;
    950 
    951     *err = U_ILLEGAL_CHAR_FOUND;
    952     return 0xffff;
    953 }
    954 
    955 static const UConverterImpl _UTF32LEImpl = {
    956     UCNV_UTF32_LittleEndian,
    957 
    958     NULL,
    959     NULL,
    960 
    961     NULL,
    962     NULL,
    963     NULL,
    964 
    965     T_UConverter_toUnicode_UTF32_LE,
    966     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    967     T_UConverter_fromUnicode_UTF32_LE,
    968     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    969     T_UConverter_getNextUChar_UTF32_LE,
    970 
    971     NULL,
    972     NULL,
    973     NULL,
    974     NULL,
    975     ucnv_getNonSurrogateUnicodeSet
    976 };
    977 
    978 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
    979 static const UConverterStaticData _UTF32LEStaticData = {
    980     sizeof(UConverterStaticData),
    981     "UTF-32LE",
    982     1234,
    983     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    984     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
    985     0,
    986     0,
    987     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    988 };
    989 
    990 
    991 const UConverterSharedData _UTF32LEData =
    992         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl);
    993 
    994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
    995 
    996 /*
    997  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
    998  * accordingly.
    999  *
   1000  * State values:
   1001  * 0    initial state
   1002  * 1    saw 00
   1003  * 2    saw 00 00
   1004  * 3    saw 00 00 FE
   1005  * 4    -
   1006  * 5    saw FF
   1007  * 6    saw FF FE
   1008  * 7    saw FF FE 00
   1009  * 8    UTF-32BE mode
   1010  * 9    UTF-32LE mode
   1011  *
   1012  * During detection: state&3==number of matching bytes so far.
   1013  *
   1014  * On output, emit U+FEFF as the first code point.
   1015  */
   1016 
   1017 static void
   1018 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
   1019     if(choice<=UCNV_RESET_TO_UNICODE) {
   1020         /* reset toUnicode: state=0 */
   1021         cnv->mode=0;
   1022     }
   1023     if(choice!=UCNV_RESET_TO_UNICODE) {
   1024         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
   1025         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1026     }
   1027 }
   1028 
   1029 static void
   1030 _UTF32Open(UConverter *cnv,
   1031            UConverterLoadArgs *pArgs,
   1032            UErrorCode *pErrorCode) {
   1033     _UTF32Reset(cnv, UCNV_RESET_BOTH);
   1034 }
   1035 
   1036 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
   1037 
   1038 static void
   1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1040                            UErrorCode *pErrorCode) {
   1041     UConverter *cnv=pArgs->converter;
   1042     const char *source=pArgs->source;
   1043     const char *sourceLimit=pArgs->sourceLimit;
   1044     int32_t *offsets=pArgs->offsets;
   1045 
   1046     int32_t state, offsetDelta;
   1047     char b;
   1048 
   1049     state=cnv->mode;
   1050 
   1051     /*
   1052      * If we detect a BOM in this buffer, then we must add the BOM size to the
   1053      * offsets because the actual converter function will not see and count the BOM.
   1054      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
   1055      */
   1056     offsetDelta=0;
   1057 
   1058     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
   1059         switch(state) {
   1060         case 0:
   1061             b=*source;
   1062             if(b==0) {
   1063                 state=1; /* could be 00 00 FE FF */
   1064             } else if(b==(char)0xff) {
   1065                 state=5; /* could be FF FE 00 00 */
   1066             } else {
   1067                 state=8; /* default to UTF-32BE */
   1068                 continue;
   1069             }
   1070             ++source;
   1071             break;
   1072         case 1:
   1073         case 2:
   1074         case 3:
   1075         case 5:
   1076         case 6:
   1077         case 7:
   1078             if(*source==utf32BOM[state]) {
   1079                 ++state;
   1080                 ++source;
   1081                 if(state==4) {
   1082                     state=8; /* detect UTF-32BE */
   1083                     offsetDelta=(int32_t)(source-pArgs->source);
   1084                 } else if(state==8) {
   1085                     state=9; /* detect UTF-32LE */
   1086                     offsetDelta=(int32_t)(source-pArgs->source);
   1087                 }
   1088             } else {
   1089                 /* switch to UTF-32BE and pass the previous bytes */
   1090                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
   1091 
   1092                 /* reset the source */
   1093                 source=pArgs->source;
   1094 
   1095                 if(count==(state&3)) {
   1096                     /* simple: all in the same buffer, just reset source */
   1097                 } else {
   1098                     UBool oldFlush=pArgs->flush;
   1099 
   1100                     /* some of the bytes are from a previous buffer, replay those first */
   1101                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1102                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
   1103                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
   1104 
   1105                     /* no offsets: bytes from previous buffer, and not enough for output */
   1106                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1107 
   1108                     /* restore real pointers; pArgs->source will be set in case 8/9 */
   1109                     pArgs->sourceLimit=sourceLimit;
   1110                     pArgs->flush=oldFlush;
   1111                 }
   1112                 state=8;
   1113                 continue;
   1114             }
   1115             break;
   1116         case 8:
   1117             /* call UTF-32BE */
   1118             pArgs->source=source;
   1119             if(offsets==NULL) {
   1120                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1121             } else {
   1122                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
   1123             }
   1124             source=pArgs->source;
   1125             break;
   1126         case 9:
   1127             /* call UTF-32LE */
   1128             pArgs->source=source;
   1129             if(offsets==NULL) {
   1130                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1131             } else {
   1132                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
   1133             }
   1134             source=pArgs->source;
   1135             break;
   1136         default:
   1137             break; /* does not occur */
   1138         }
   1139     }
   1140 
   1141     /* add BOM size to offsets - see comment at offsetDelta declaration */
   1142     if(offsets!=NULL && offsetDelta!=0) {
   1143         int32_t *offsetsLimit=pArgs->offsets;
   1144         while(offsets<offsetsLimit) {
   1145             *offsets++ += offsetDelta;
   1146         }
   1147     }
   1148 
   1149     pArgs->source=source;
   1150 
   1151     if(source==sourceLimit && pArgs->flush) {
   1152         /* handle truncated input */
   1153         switch(state) {
   1154         case 0:
   1155             break; /* no input at all, nothing to do */
   1156         case 8:
   1157             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1158             break;
   1159         case 9:
   1160             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1161             break;
   1162         default:
   1163             /* handle 0<state<8: call UTF-32BE with too-short input */
   1164             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1165             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
   1166 
   1167             /* no offsets: not enough for output */
   1168             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1169             pArgs->source=source;
   1170             pArgs->sourceLimit=sourceLimit;
   1171             state=8;
   1172             break;
   1173         }
   1174     }
   1175 
   1176     cnv->mode=state;
   1177 }
   1178 
   1179 static UChar32
   1180 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1181                    UErrorCode *pErrorCode) {
   1182     switch(pArgs->converter->mode) {
   1183     case 8:
   1184         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
   1185     case 9:
   1186         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
   1187     default:
   1188         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1189     }
   1190 }
   1191 
   1192 static const UConverterImpl _UTF32Impl = {
   1193     UCNV_UTF32,
   1194 
   1195     NULL,
   1196     NULL,
   1197 
   1198     _UTF32Open,
   1199     NULL,
   1200     _UTF32Reset,
   1201 
   1202     _UTF32ToUnicodeWithOffsets,
   1203     _UTF32ToUnicodeWithOffsets,
   1204 #if U_IS_BIG_ENDIAN
   1205     T_UConverter_fromUnicode_UTF32_BE,
   1206     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
   1207 #else
   1208     T_UConverter_fromUnicode_UTF32_LE,
   1209     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
   1210 #endif
   1211     _UTF32GetNextUChar,
   1212 
   1213     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
   1214     NULL,
   1215     NULL,
   1216     NULL,
   1217     ucnv_getNonSurrogateUnicodeSet
   1218 };
   1219 
   1220 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
   1221 static const UConverterStaticData _UTF32StaticData = {
   1222     sizeof(UConverterStaticData),
   1223     "UTF-32",
   1224     1236,
   1225     UCNV_IBM, UCNV_UTF32, 4, 4,
   1226 #if U_IS_BIG_ENDIAN
   1227     { 0, 0, 0xff, 0xfd }, 4,
   1228 #else
   1229     { 0xfd, 0xff, 0, 0 }, 4,
   1230 #endif
   1231     FALSE, FALSE,
   1232     0,
   1233     0,
   1234     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1235 };
   1236 
   1237 const UConverterSharedData _UTF32Data =
   1238         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl);
   1239 
   1240 #endif
   1241