Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2011, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv_u32.c
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2002jul01
     12 *   created by: Markus W. Scherer
     13 *
     14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_CONVERSION
     20 
     21 #include "unicode/ucnv.h"
     22 #include "unicode/utf.h"
     23 #include "ucnv_bld.h"
     24 #include "ucnv_cnv.h"
     25 #include "cmemory.h"
     26 
     27 #define MAXIMUM_UCS2            0x0000FFFF
     28 #define MAXIMUM_UTF             0x0010FFFF
     29 #define HALF_SHIFT              10
     30 #define HALF_BASE               0x0010000
     31 #define HALF_MASK               0x3FF
     32 #define SURROGATE_HIGH_START    0xD800
     33 #define SURROGATE_LOW_START     0xDC00
     34 
     35 /* -SURROGATE_LOW_START + HALF_BASE */
     36 #define SURROGATE_LOW_BASE      9216
     37 
     38 enum {
     39     UCNV_NEED_TO_WRITE_BOM=1
     40 };
     41 
     42 /* UTF-32BE ----------------------------------------------------------------- */
     43 
     44 static void
     45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
     46                                 UErrorCode * err)
     47 {
     48     const unsigned char *mySource = (unsigned char *) args->source;
     49     UChar *myTarget = args->target;
     50     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     51     const UChar *targetLimit = args->targetLimit;
     52     unsigned char *toUBytes = args->converter->toUBytes;
     53     uint32_t ch, i;
     54 
     55     /* Restore state of current sequence */
     56     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
     57         i = args->converter->toULength;       /* restore # of bytes consumed */
     58         args->converter->toULength = 0;
     59 
     60         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
     61         args->converter->toUnicodeStatus = 0;
     62         goto morebytes;
     63     }
     64 
     65     while (mySource < sourceLimit && myTarget < targetLimit) {
     66         i = 0;
     67         ch = 0;
     68 morebytes:
     69         while (i < sizeof(uint32_t)) {
     70             if (mySource < sourceLimit) {
     71                 ch = (ch << 8) | (uint8_t)(*mySource);
     72                 toUBytes[i++] = (char) *(mySource++);
     73             }
     74             else {
     75                 /* stores a partially calculated target*/
     76                 /* + 1 to make 0 a valid character */
     77                 args->converter->toUnicodeStatus = ch + 1;
     78                 args->converter->toULength = (int8_t) i;
     79                 goto donefornow;
     80             }
     81         }
     82 
     83         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
     84             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
     85             if (ch <= MAXIMUM_UCS2)
     86             {
     87                 /* fits in 16 bits */
     88                 *(myTarget++) = (UChar) ch;
     89             }
     90             else {
     91                 /* write out the surrogates */
     92                 *(myTarget++) = U16_LEAD(ch);
     93                 ch = U16_TRAIL(ch);
     94                 if (myTarget < targetLimit) {
     95                     *(myTarget++) = (UChar)ch;
     96                 }
     97                 else {
     98                     /* Put in overflow buffer (not handled here) */
     99                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    100                     args->converter->UCharErrorBufferLength = 1;
    101                     *err = U_BUFFER_OVERFLOW_ERROR;
    102                     break;
    103                 }
    104             }
    105         }
    106         else {
    107             args->converter->toULength = (int8_t)i;
    108             *err = U_ILLEGAL_CHAR_FOUND;
    109             break;
    110         }
    111     }
    112 
    113 donefornow:
    114     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    115         /* End of target buffer */
    116         *err = U_BUFFER_OVERFLOW_ERROR;
    117     }
    118 
    119     args->target = myTarget;
    120     args->source = (const char *) mySource;
    121 }
    122 
    123 static void
    124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    125                                              UErrorCode * err)
    126 {
    127     const unsigned char *mySource = (unsigned char *) args->source;
    128     UChar *myTarget = args->target;
    129     int32_t *myOffsets = args->offsets;
    130     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    131     const UChar *targetLimit = args->targetLimit;
    132     unsigned char *toUBytes = args->converter->toUBytes;
    133     uint32_t ch, i;
    134     int32_t offsetNum = 0;
    135 
    136     /* Restore state of current sequence */
    137     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
    138         i = args->converter->toULength;       /* restore # of bytes consumed */
    139         args->converter->toULength = 0;
    140 
    141         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
    142         args->converter->toUnicodeStatus = 0;
    143         goto morebytes;
    144     }
    145 
    146     while (mySource < sourceLimit && myTarget < targetLimit) {
    147         i = 0;
    148         ch = 0;
    149 morebytes:
    150         while (i < sizeof(uint32_t)) {
    151             if (mySource < sourceLimit) {
    152                 ch = (ch << 8) | (uint8_t)(*mySource);
    153                 toUBytes[i++] = (char) *(mySource++);
    154             }
    155             else {
    156                 /* stores a partially calculated target*/
    157                 /* + 1 to make 0 a valid character */
    158                 args->converter->toUnicodeStatus = ch + 1;
    159                 args->converter->toULength = (int8_t) i;
    160                 goto donefornow;
    161             }
    162         }
    163 
    164         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    165             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    166             if (ch <= MAXIMUM_UCS2) {
    167                 /* fits in 16 bits */
    168                 *(myTarget++) = (UChar) ch;
    169                 *(myOffsets++) = offsetNum;
    170             }
    171             else {
    172                 /* write out the surrogates */
    173                 *(myTarget++) = U16_LEAD(ch);
    174                 *myOffsets++ = offsetNum;
    175                 ch = U16_TRAIL(ch);
    176                 if (myTarget < targetLimit)
    177                 {
    178                     *(myTarget++) = (UChar)ch;
    179                     *(myOffsets++) = offsetNum;
    180                 }
    181                 else {
    182                     /* Put in overflow buffer (not handled here) */
    183                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    184                     args->converter->UCharErrorBufferLength = 1;
    185                     *err = U_BUFFER_OVERFLOW_ERROR;
    186                     break;
    187                 }
    188             }
    189         }
    190         else {
    191             args->converter->toULength = (int8_t)i;
    192             *err = U_ILLEGAL_CHAR_FOUND;
    193             break;
    194         }
    195         offsetNum += i;
    196     }
    197 
    198 donefornow:
    199     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    200     {
    201         /* End of target buffer */
    202         *err = U_BUFFER_OVERFLOW_ERROR;
    203     }
    204 
    205     args->target = myTarget;
    206     args->source = (const char *) mySource;
    207     args->offsets = myOffsets;
    208 }
    209 
    210 static void
    211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
    212                                   UErrorCode * err)
    213 {
    214     const UChar *mySource = args->source;
    215     unsigned char *myTarget;
    216     const UChar *sourceLimit = args->sourceLimit;
    217     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    218     UChar32 ch, ch2;
    219     unsigned int indexToWrite;
    220     unsigned char temp[sizeof(uint32_t)];
    221 
    222     if(mySource >= sourceLimit) {
    223         /* no input, nothing to do */
    224         return;
    225     }
    226 
    227     /* write the BOM if necessary */
    228     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    229         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    230         ucnv_fromUWriteBytes(args->converter,
    231                              bom, 4,
    232                              &args->target, args->targetLimit,
    233                              &args->offsets, -1,
    234                              err);
    235         args->converter->fromUnicodeStatus=0;
    236     }
    237 
    238     myTarget = (unsigned char *) args->target;
    239     temp[0] = 0;
    240 
    241     if (args->converter->fromUChar32) {
    242         ch = args->converter->fromUChar32;
    243         args->converter->fromUChar32 = 0;
    244         goto lowsurogate;
    245     }
    246 
    247     while (mySource < sourceLimit && myTarget < targetLimit) {
    248         ch = *(mySource++);
    249 
    250         if (U_IS_SURROGATE(ch)) {
    251             if (U_IS_LEAD(ch)) {
    252 lowsurogate:
    253                 if (mySource < sourceLimit) {
    254                     ch2 = *mySource;
    255                     if (U_IS_TRAIL(ch2)) {
    256                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    257                         mySource++;
    258                     }
    259                     else {
    260                         /* this is an unmatched trail code unit (2nd surrogate) */
    261                         /* callback(illegal) */
    262                         args->converter->fromUChar32 = ch;
    263                         *err = U_ILLEGAL_CHAR_FOUND;
    264                         break;
    265                     }
    266                 }
    267                 else {
    268                     /* ran out of source */
    269                     args->converter->fromUChar32 = ch;
    270                     if (args->flush) {
    271                         /* this is an unmatched trail code unit (2nd surrogate) */
    272                         /* callback(illegal) */
    273                         *err = U_ILLEGAL_CHAR_FOUND;
    274                     }
    275                     break;
    276                 }
    277             }
    278             else {
    279                 /* this is an unmatched trail code unit (2nd surrogate) */
    280                 /* callback(illegal) */
    281                 args->converter->fromUChar32 = ch;
    282                 *err = U_ILLEGAL_CHAR_FOUND;
    283                 break;
    284             }
    285         }
    286 
    287         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    288         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    289         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    290         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    291 
    292         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    293             if (myTarget < targetLimit) {
    294                 *(myTarget++) = temp[indexToWrite];
    295             }
    296             else {
    297                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    298                 *err = U_BUFFER_OVERFLOW_ERROR;
    299             }
    300         }
    301     }
    302 
    303     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    304         *err = U_BUFFER_OVERFLOW_ERROR;
    305     }
    306 
    307     args->target = (char *) myTarget;
    308     args->source = mySource;
    309 }
    310 
    311 static void
    312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    313                                                UErrorCode * err)
    314 {
    315     const UChar *mySource = args->source;
    316     unsigned char *myTarget;
    317     int32_t *myOffsets;
    318     const UChar *sourceLimit = args->sourceLimit;
    319     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    320     UChar32 ch, ch2;
    321     int32_t offsetNum = 0;
    322     unsigned int indexToWrite;
    323     unsigned char temp[sizeof(uint32_t)];
    324 
    325     if(mySource >= sourceLimit) {
    326         /* no input, nothing to do */
    327         return;
    328     }
    329 
    330     /* write the BOM if necessary */
    331     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    332         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    333         ucnv_fromUWriteBytes(args->converter,
    334                              bom, 4,
    335                              &args->target, args->targetLimit,
    336                              &args->offsets, -1,
    337                              err);
    338         args->converter->fromUnicodeStatus=0;
    339     }
    340 
    341     myTarget = (unsigned char *) args->target;
    342     myOffsets = args->offsets;
    343     temp[0] = 0;
    344 
    345     if (args->converter->fromUChar32) {
    346         ch = args->converter->fromUChar32;
    347         args->converter->fromUChar32 = 0;
    348         goto lowsurogate;
    349     }
    350 
    351     while (mySource < sourceLimit && myTarget < targetLimit) {
    352         ch = *(mySource++);
    353 
    354         if (U_IS_SURROGATE(ch)) {
    355             if (U_IS_LEAD(ch)) {
    356 lowsurogate:
    357                 if (mySource < sourceLimit) {
    358                     ch2 = *mySource;
    359                     if (U_IS_TRAIL(ch2)) {
    360                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    361                         mySource++;
    362                     }
    363                     else {
    364                         /* this is an unmatched trail code unit (2nd surrogate) */
    365                         /* callback(illegal) */
    366                         args->converter->fromUChar32 = ch;
    367                         *err = U_ILLEGAL_CHAR_FOUND;
    368                         break;
    369                     }
    370                 }
    371                 else {
    372                     /* ran out of source */
    373                     args->converter->fromUChar32 = ch;
    374                     if (args->flush) {
    375                         /* this is an unmatched trail code unit (2nd surrogate) */
    376                         /* callback(illegal) */
    377                         *err = U_ILLEGAL_CHAR_FOUND;
    378                     }
    379                     break;
    380                 }
    381             }
    382             else {
    383                 /* this is an unmatched trail code unit (2nd surrogate) */
    384                 /* callback(illegal) */
    385                 args->converter->fromUChar32 = ch;
    386                 *err = U_ILLEGAL_CHAR_FOUND;
    387                 break;
    388             }
    389         }
    390 
    391         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    392         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    393         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    394         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    395 
    396         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    397             if (myTarget < targetLimit) {
    398                 *(myTarget++) = temp[indexToWrite];
    399                 *(myOffsets++) = offsetNum;
    400             }
    401             else {
    402                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    403                 *err = U_BUFFER_OVERFLOW_ERROR;
    404             }
    405         }
    406         offsetNum = offsetNum + 1 + (temp[1] != 0);
    407     }
    408 
    409     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    410         *err = U_BUFFER_OVERFLOW_ERROR;
    411     }
    412 
    413     args->target = (char *) myTarget;
    414     args->source = mySource;
    415     args->offsets = myOffsets;
    416 }
    417 
    418 static UChar32
    419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
    420                                    UErrorCode* err)
    421 {
    422     const uint8_t *mySource;
    423     UChar32 myUChar;
    424     int32_t length;
    425 
    426     mySource = (const uint8_t *)args->source;
    427     if (mySource >= (const uint8_t *)args->sourceLimit)
    428     {
    429         /* no input */
    430         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    431         return 0xffff;
    432     }
    433 
    434     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    435     if (length < 4)
    436     {
    437         /* got a partial character */
    438         uprv_memcpy(args->converter->toUBytes, mySource, length);
    439         args->converter->toULength = (int8_t)length;
    440         args->source = (const char *)(mySource + length);
    441         *err = U_TRUNCATED_CHAR_FOUND;
    442         return 0xffff;
    443     }
    444 
    445     /* Don't even try to do a direct cast because the value may be on an odd address. */
    446     myUChar = ((UChar32)mySource[0] << 24)
    447             | ((UChar32)mySource[1] << 16)
    448             | ((UChar32)mySource[2] << 8)
    449             | ((UChar32)mySource[3]);
    450 
    451     args->source = (const char *)(mySource + 4);
    452     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    453         return myUChar;
    454     }
    455 
    456     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    457     args->converter->toULength = 4;
    458 
    459     *err = U_ILLEGAL_CHAR_FOUND;
    460     return 0xffff;
    461 }
    462 
    463 static const UConverterImpl _UTF32BEImpl = {
    464     UCNV_UTF32_BigEndian,
    465 
    466     NULL,
    467     NULL,
    468 
    469     NULL,
    470     NULL,
    471     NULL,
    472 
    473     T_UConverter_toUnicode_UTF32_BE,
    474     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
    475     T_UConverter_fromUnicode_UTF32_BE,
    476     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
    477     T_UConverter_getNextUChar_UTF32_BE,
    478 
    479     NULL,
    480     NULL,
    481     NULL,
    482     NULL,
    483     ucnv_getNonSurrogateUnicodeSet
    484 };
    485 
    486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
    487 static const UConverterStaticData _UTF32BEStaticData = {
    488     sizeof(UConverterStaticData),
    489     "UTF-32BE",
    490     1232,
    491     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
    492     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
    493     0,
    494     0,
    495     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    496 };
    497 
    498 const UConverterSharedData _UTF32BEData = {
    499     sizeof(UConverterSharedData), ~((uint32_t) 0),
    500     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
    501     0
    502 };
    503 
    504 /* UTF-32LE ---------------------------------------------------------- */
    505 
    506 static void
    507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
    508                                 UErrorCode * err)
    509 {
    510     const unsigned char *mySource = (unsigned char *) args->source;
    511     UChar *myTarget = args->target;
    512     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    513     const UChar *targetLimit = args->targetLimit;
    514     unsigned char *toUBytes = args->converter->toUBytes;
    515     uint32_t ch, i;
    516 
    517     /* Restore state of current sequence */
    518     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    519     {
    520         i = args->converter->toULength;       /* restore # of bytes consumed */
    521         args->converter->toULength = 0;
    522 
    523         /* Stores the previously calculated ch from a previous call*/
    524         ch = args->converter->toUnicodeStatus - 1;
    525         args->converter->toUnicodeStatus = 0;
    526         goto morebytes;
    527     }
    528 
    529     while (mySource < sourceLimit && myTarget < targetLimit)
    530     {
    531         i = 0;
    532         ch = 0;
    533 morebytes:
    534         while (i < sizeof(uint32_t))
    535         {
    536             if (mySource < sourceLimit)
    537             {
    538                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    539                 toUBytes[i++] = (char) *(mySource++);
    540             }
    541             else
    542             {
    543                 /* stores a partially calculated target*/
    544                 /* + 1 to make 0 a valid character */
    545                 args->converter->toUnicodeStatus = ch + 1;
    546                 args->converter->toULength = (int8_t) i;
    547                 goto donefornow;
    548             }
    549         }
    550 
    551         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    552             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    553             if (ch <= MAXIMUM_UCS2) {
    554                 /* fits in 16 bits */
    555                 *(myTarget++) = (UChar) ch;
    556             }
    557             else {
    558                 /* write out the surrogates */
    559                 *(myTarget++) = U16_LEAD(ch);
    560                 ch = U16_TRAIL(ch);
    561                 if (myTarget < targetLimit) {
    562                     *(myTarget++) = (UChar)ch;
    563                 }
    564                 else {
    565                     /* Put in overflow buffer (not handled here) */
    566                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    567                     args->converter->UCharErrorBufferLength = 1;
    568                     *err = U_BUFFER_OVERFLOW_ERROR;
    569                     break;
    570                 }
    571             }
    572         }
    573         else {
    574             args->converter->toULength = (int8_t)i;
    575             *err = U_ILLEGAL_CHAR_FOUND;
    576             break;
    577         }
    578     }
    579 
    580 donefornow:
    581     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    582     {
    583         /* End of target buffer */
    584         *err = U_BUFFER_OVERFLOW_ERROR;
    585     }
    586 
    587     args->target = myTarget;
    588     args->source = (const char *) mySource;
    589 }
    590 
    591 static void
    592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    593                                              UErrorCode * err)
    594 {
    595     const unsigned char *mySource = (unsigned char *) args->source;
    596     UChar *myTarget = args->target;
    597     int32_t *myOffsets = args->offsets;
    598     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    599     const UChar *targetLimit = args->targetLimit;
    600     unsigned char *toUBytes = args->converter->toUBytes;
    601     uint32_t ch, i;
    602     int32_t offsetNum = 0;
    603 
    604     /* Restore state of current sequence */
    605     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    606     {
    607         i = args->converter->toULength;       /* restore # of bytes consumed */
    608         args->converter->toULength = 0;
    609 
    610         /* Stores the previously calculated ch from a previous call*/
    611         ch = args->converter->toUnicodeStatus - 1;
    612         args->converter->toUnicodeStatus = 0;
    613         goto morebytes;
    614     }
    615 
    616     while (mySource < sourceLimit && myTarget < targetLimit)
    617     {
    618         i = 0;
    619         ch = 0;
    620 morebytes:
    621         while (i < sizeof(uint32_t))
    622         {
    623             if (mySource < sourceLimit)
    624             {
    625                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    626                 toUBytes[i++] = (char) *(mySource++);
    627             }
    628             else
    629             {
    630                 /* stores a partially calculated target*/
    631                 /* + 1 to make 0 a valid character */
    632                 args->converter->toUnicodeStatus = ch + 1;
    633                 args->converter->toULength = (int8_t) i;
    634                 goto donefornow;
    635             }
    636         }
    637 
    638         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
    639         {
    640             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    641             if (ch <= MAXIMUM_UCS2)
    642             {
    643                 /* fits in 16 bits */
    644                 *(myTarget++) = (UChar) ch;
    645                 *(myOffsets++) = offsetNum;
    646             }
    647             else {
    648                 /* write out the surrogates */
    649                 *(myTarget++) = U16_LEAD(ch);
    650                 *(myOffsets++) = offsetNum;
    651                 ch = U16_TRAIL(ch);
    652                 if (myTarget < targetLimit)
    653                 {
    654                     *(myTarget++) = (UChar)ch;
    655                     *(myOffsets++) = offsetNum;
    656                 }
    657                 else
    658                 {
    659                     /* Put in overflow buffer (not handled here) */
    660                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    661                     args->converter->UCharErrorBufferLength = 1;
    662                     *err = U_BUFFER_OVERFLOW_ERROR;
    663                     break;
    664                 }
    665             }
    666         }
    667         else
    668         {
    669             args->converter->toULength = (int8_t)i;
    670             *err = U_ILLEGAL_CHAR_FOUND;
    671             break;
    672         }
    673         offsetNum += i;
    674     }
    675 
    676 donefornow:
    677     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    678     {
    679         /* End of target buffer */
    680         *err = U_BUFFER_OVERFLOW_ERROR;
    681     }
    682 
    683     args->target = myTarget;
    684     args->source = (const char *) mySource;
    685     args->offsets = myOffsets;
    686 }
    687 
    688 static void
    689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
    690                                   UErrorCode * err)
    691 {
    692     const UChar *mySource = args->source;
    693     unsigned char *myTarget;
    694     const UChar *sourceLimit = args->sourceLimit;
    695     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    696     UChar32 ch, ch2;
    697     unsigned int indexToWrite;
    698     unsigned char temp[sizeof(uint32_t)];
    699 
    700     if(mySource >= sourceLimit) {
    701         /* no input, nothing to do */
    702         return;
    703     }
    704 
    705     /* write the BOM if necessary */
    706     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    707         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
    708         ucnv_fromUWriteBytes(args->converter,
    709                              bom, 4,
    710                              &args->target, args->targetLimit,
    711                              &args->offsets, -1,
    712                              err);
    713         args->converter->fromUnicodeStatus=0;
    714     }
    715 
    716     myTarget = (unsigned char *) args->target;
    717     temp[3] = 0;
    718 
    719     if (args->converter->fromUChar32)
    720     {
    721         ch = args->converter->fromUChar32;
    722         args->converter->fromUChar32 = 0;
    723         goto lowsurogate;
    724     }
    725 
    726     while (mySource < sourceLimit && myTarget < targetLimit)
    727     {
    728         ch = *(mySource++);
    729 
    730         if (U16_IS_SURROGATE(ch)) {
    731             if (U16_IS_LEAD(ch))
    732             {
    733 lowsurogate:
    734                 if (mySource < sourceLimit)
    735                 {
    736                     ch2 = *mySource;
    737                     if (U16_IS_TRAIL(ch2)) {
    738                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    739                         mySource++;
    740                     }
    741                     else {
    742                         /* this is an unmatched trail code unit (2nd surrogate) */
    743                         /* callback(illegal) */
    744                         args->converter->fromUChar32 = ch;
    745                         *err = U_ILLEGAL_CHAR_FOUND;
    746                         break;
    747                     }
    748                 }
    749                 else {
    750                     /* ran out of source */
    751                     args->converter->fromUChar32 = ch;
    752                     if (args->flush) {
    753                         /* this is an unmatched trail code unit (2nd surrogate) */
    754                         /* callback(illegal) */
    755                         *err = U_ILLEGAL_CHAR_FOUND;
    756                     }
    757                     break;
    758                 }
    759             }
    760             else {
    761                 /* this is an unmatched trail code unit (2nd surrogate) */
    762                 /* callback(illegal) */
    763                 args->converter->fromUChar32 = ch;
    764                 *err = U_ILLEGAL_CHAR_FOUND;
    765                 break;
    766             }
    767         }
    768 
    769         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    770         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    771         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    772         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    773 
    774         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    775         {
    776             if (myTarget < targetLimit)
    777             {
    778                 *(myTarget++) = temp[indexToWrite];
    779             }
    780             else
    781             {
    782                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    783                 *err = U_BUFFER_OVERFLOW_ERROR;
    784             }
    785         }
    786     }
    787 
    788     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    789     {
    790         *err = U_BUFFER_OVERFLOW_ERROR;
    791     }
    792 
    793     args->target = (char *) myTarget;
    794     args->source = mySource;
    795 }
    796 
    797 static void
    798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    799                                                UErrorCode * err)
    800 {
    801     const UChar *mySource = args->source;
    802     unsigned char *myTarget;
    803     int32_t *myOffsets;
    804     const UChar *sourceLimit = args->sourceLimit;
    805     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    806     UChar32 ch, ch2;
    807     unsigned int indexToWrite;
    808     unsigned char temp[sizeof(uint32_t)];
    809     int32_t offsetNum = 0;
    810 
    811     if(mySource >= sourceLimit) {
    812         /* no input, nothing to do */
    813         return;
    814     }
    815 
    816     /* write the BOM if necessary */
    817     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    818         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
    819         ucnv_fromUWriteBytes(args->converter,
    820                              bom, 4,
    821                              &args->target, args->targetLimit,
    822                              &args->offsets, -1,
    823                              err);
    824         args->converter->fromUnicodeStatus=0;
    825     }
    826 
    827     myTarget = (unsigned char *) args->target;
    828     myOffsets = args->offsets;
    829     temp[3] = 0;
    830 
    831     if (args->converter->fromUChar32)
    832     {
    833         ch = args->converter->fromUChar32;
    834         args->converter->fromUChar32 = 0;
    835         goto lowsurogate;
    836     }
    837 
    838     while (mySource < sourceLimit && myTarget < targetLimit)
    839     {
    840         ch = *(mySource++);
    841 
    842         if (U16_IS_SURROGATE(ch)) {
    843             if (U16_IS_LEAD(ch))
    844             {
    845 lowsurogate:
    846                 if (mySource < sourceLimit)
    847                 {
    848                     ch2 = *mySource;
    849                     if (U16_IS_TRAIL(ch2))
    850                     {
    851                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    852                         mySource++;
    853                     }
    854                     else {
    855                         /* this is an unmatched trail code unit (2nd surrogate) */
    856                         /* callback(illegal) */
    857                         args->converter->fromUChar32 = ch;
    858                         *err = U_ILLEGAL_CHAR_FOUND;
    859                         break;
    860                     }
    861                 }
    862                 else {
    863                     /* ran out of source */
    864                     args->converter->fromUChar32 = ch;
    865                     if (args->flush) {
    866                         /* this is an unmatched trail code unit (2nd surrogate) */
    867                         /* callback(illegal) */
    868                         *err = U_ILLEGAL_CHAR_FOUND;
    869                     }
    870                     break;
    871                 }
    872             }
    873             else {
    874                 /* this is an unmatched trail code unit (2nd surrogate) */
    875                 /* callback(illegal) */
    876                 args->converter->fromUChar32 = ch;
    877                 *err = U_ILLEGAL_CHAR_FOUND;
    878                 break;
    879             }
    880         }
    881 
    882         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    883         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    884         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    885         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    886 
    887         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    888         {
    889             if (myTarget < targetLimit)
    890             {
    891                 *(myTarget++) = temp[indexToWrite];
    892                 *(myOffsets++) = offsetNum;
    893             }
    894             else
    895             {
    896                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    897                 *err = U_BUFFER_OVERFLOW_ERROR;
    898             }
    899         }
    900         offsetNum = offsetNum + 1 + (temp[2] != 0);
    901     }
    902 
    903     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    904     {
    905         *err = U_BUFFER_OVERFLOW_ERROR;
    906     }
    907 
    908     args->target = (char *) myTarget;
    909     args->source = mySource;
    910     args->offsets = myOffsets;
    911 }
    912 
    913 static UChar32
    914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
    915                                    UErrorCode* err)
    916 {
    917     const uint8_t *mySource;
    918     UChar32 myUChar;
    919     int32_t length;
    920 
    921     mySource = (const uint8_t *)args->source;
    922     if (mySource >= (const uint8_t *)args->sourceLimit)
    923     {
    924         /* no input */
    925         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    926         return 0xffff;
    927     }
    928 
    929     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    930     if (length < 4)
    931     {
    932         /* got a partial character */
    933         uprv_memcpy(args->converter->toUBytes, mySource, length);
    934         args->converter->toULength = (int8_t)length;
    935         args->source = (const char *)(mySource + length);
    936         *err = U_TRUNCATED_CHAR_FOUND;
    937         return 0xffff;
    938     }
    939 
    940     /* Don't even try to do a direct cast because the value may be on an odd address. */
    941     myUChar = ((UChar32)mySource[3] << 24)
    942             | ((UChar32)mySource[2] << 16)
    943             | ((UChar32)mySource[1] << 8)
    944             | ((UChar32)mySource[0]);
    945 
    946     args->source = (const char *)(mySource + 4);
    947     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    948         return myUChar;
    949     }
    950 
    951     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    952     args->converter->toULength = 4;
    953 
    954     *err = U_ILLEGAL_CHAR_FOUND;
    955     return 0xffff;
    956 }
    957 
    958 static const UConverterImpl _UTF32LEImpl = {
    959     UCNV_UTF32_LittleEndian,
    960 
    961     NULL,
    962     NULL,
    963 
    964     NULL,
    965     NULL,
    966     NULL,
    967 
    968     T_UConverter_toUnicode_UTF32_LE,
    969     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
    970     T_UConverter_fromUnicode_UTF32_LE,
    971     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    972     T_UConverter_getNextUChar_UTF32_LE,
    973 
    974     NULL,
    975     NULL,
    976     NULL,
    977     NULL,
    978     ucnv_getNonSurrogateUnicodeSet
    979 };
    980 
    981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
    982 static const UConverterStaticData _UTF32LEStaticData = {
    983     sizeof(UConverterStaticData),
    984     "UTF-32LE",
    985     1234,
    986     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
    987     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
    988     0,
    989     0,
    990     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    991 };
    992 
    993 
    994 const UConverterSharedData _UTF32LEData = {
    995     sizeof(UConverterSharedData), ~((uint32_t) 0),
    996     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
    997     0
    998 };
    999 
   1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
   1001 
   1002 /*
   1003  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
   1004  * accordingly.
   1005  *
   1006  * State values:
   1007  * 0    initial state
   1008  * 1    saw 00
   1009  * 2    saw 00 00
   1010  * 3    saw 00 00 FE
   1011  * 4    -
   1012  * 5    saw FF
   1013  * 6    saw FF FE
   1014  * 7    saw FF FE 00
   1015  * 8    UTF-32BE mode
   1016  * 9    UTF-32LE mode
   1017  *
   1018  * During detection: state&3==number of matching bytes so far.
   1019  *
   1020  * On output, emit U+FEFF as the first code point.
   1021  */
   1022 
   1023 static void
   1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
   1025     if(choice<=UCNV_RESET_TO_UNICODE) {
   1026         /* reset toUnicode: state=0 */
   1027         cnv->mode=0;
   1028     }
   1029     if(choice!=UCNV_RESET_TO_UNICODE) {
   1030         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
   1031         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
   1032     }
   1033 }
   1034 
   1035 static void
   1036 _UTF32Open(UConverter *cnv,
   1037            UConverterLoadArgs *pArgs,
   1038            UErrorCode *pErrorCode) {
   1039     _UTF32Reset(cnv, UCNV_RESET_BOTH);
   1040 }
   1041 
   1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
   1043 
   1044 static void
   1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
   1046                            UErrorCode *pErrorCode) {
   1047     UConverter *cnv=pArgs->converter;
   1048     const char *source=pArgs->source;
   1049     const char *sourceLimit=pArgs->sourceLimit;
   1050     int32_t *offsets=pArgs->offsets;
   1051 
   1052     int32_t state, offsetDelta;
   1053     char b;
   1054 
   1055     state=cnv->mode;
   1056 
   1057     /*
   1058      * If we detect a BOM in this buffer, then we must add the BOM size to the
   1059      * offsets because the actual converter function will not see and count the BOM.
   1060      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
   1061      */
   1062     offsetDelta=0;
   1063 
   1064     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
   1065         switch(state) {
   1066         case 0:
   1067             b=*source;
   1068             if(b==0) {
   1069                 state=1; /* could be 00 00 FE FF */
   1070             } else if(b==(char)0xff) {
   1071                 state=5; /* could be FF FE 00 00 */
   1072             } else {
   1073                 state=8; /* default to UTF-32BE */
   1074                 continue;
   1075             }
   1076             ++source;
   1077             break;
   1078         case 1:
   1079         case 2:
   1080         case 3:
   1081         case 5:
   1082         case 6:
   1083         case 7:
   1084             if(*source==utf32BOM[state]) {
   1085                 ++state;
   1086                 ++source;
   1087                 if(state==4) {
   1088                     state=8; /* detect UTF-32BE */
   1089                     offsetDelta=(int32_t)(source-pArgs->source);
   1090                 } else if(state==8) {
   1091                     state=9; /* detect UTF-32LE */
   1092                     offsetDelta=(int32_t)(source-pArgs->source);
   1093                 }
   1094             } else {
   1095                 /* switch to UTF-32BE and pass the previous bytes */
   1096                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
   1097 
   1098                 /* reset the source */
   1099                 source=pArgs->source;
   1100 
   1101                 if(count==(state&3)) {
   1102                     /* simple: all in the same buffer, just reset source */
   1103                 } else {
   1104                     UBool oldFlush=pArgs->flush;
   1105 
   1106                     /* some of the bytes are from a previous buffer, replay those first */
   1107                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1108                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
   1109                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
   1110 
   1111                     /* no offsets: bytes from previous buffer, and not enough for output */
   1112                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1113 
   1114                     /* restore real pointers; pArgs->source will be set in case 8/9 */
   1115                     pArgs->sourceLimit=sourceLimit;
   1116                     pArgs->flush=oldFlush;
   1117                 }
   1118                 state=8;
   1119                 continue;
   1120             }
   1121             break;
   1122         case 8:
   1123             /* call UTF-32BE */
   1124             pArgs->source=source;
   1125             if(offsets==NULL) {
   1126                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1127             } else {
   1128                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
   1129             }
   1130             source=pArgs->source;
   1131             break;
   1132         case 9:
   1133             /* call UTF-32LE */
   1134             pArgs->source=source;
   1135             if(offsets==NULL) {
   1136                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1137             } else {
   1138                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
   1139             }
   1140             source=pArgs->source;
   1141             break;
   1142         default:
   1143             break; /* does not occur */
   1144         }
   1145     }
   1146 
   1147     /* add BOM size to offsets - see comment at offsetDelta declaration */
   1148     if(offsets!=NULL && offsetDelta!=0) {
   1149         int32_t *offsetsLimit=pArgs->offsets;
   1150         while(offsets<offsetsLimit) {
   1151             *offsets++ += offsetDelta;
   1152         }
   1153     }
   1154 
   1155     pArgs->source=source;
   1156 
   1157     if(source==sourceLimit && pArgs->flush) {
   1158         /* handle truncated input */
   1159         switch(state) {
   1160         case 0:
   1161             break; /* no input at all, nothing to do */
   1162         case 8:
   1163             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1164             break;
   1165         case 9:
   1166             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
   1167             break;
   1168         default:
   1169             /* handle 0<state<8: call UTF-32BE with too-short input */
   1170             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
   1171             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
   1172 
   1173             /* no offsets: not enough for output */
   1174             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
   1175             pArgs->source=source;
   1176             pArgs->sourceLimit=sourceLimit;
   1177             state=8;
   1178             break;
   1179         }
   1180     }
   1181 
   1182     cnv->mode=state;
   1183 }
   1184 
   1185 static UChar32
   1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
   1187                    UErrorCode *pErrorCode) {
   1188     switch(pArgs->converter->mode) {
   1189     case 8:
   1190         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
   1191     case 9:
   1192         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
   1193     default:
   1194         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
   1195     }
   1196 }
   1197 
   1198 static const UConverterImpl _UTF32Impl = {
   1199     UCNV_UTF32,
   1200 
   1201     NULL,
   1202     NULL,
   1203 
   1204     _UTF32Open,
   1205     NULL,
   1206     _UTF32Reset,
   1207 
   1208     _UTF32ToUnicodeWithOffsets,
   1209     _UTF32ToUnicodeWithOffsets,
   1210 #if U_IS_BIG_ENDIAN
   1211     T_UConverter_fromUnicode_UTF32_BE,
   1212     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
   1213 #else
   1214     T_UConverter_fromUnicode_UTF32_LE,
   1215     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
   1216 #endif
   1217     _UTF32GetNextUChar,
   1218 
   1219     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
   1220     NULL,
   1221     NULL,
   1222     NULL,
   1223     ucnv_getNonSurrogateUnicodeSet
   1224 };
   1225 
   1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */
   1227 static const UConverterStaticData _UTF32StaticData = {
   1228     sizeof(UConverterStaticData),
   1229     "UTF-32",
   1230     1236,
   1231     UCNV_IBM, UCNV_UTF32, 4, 4,
   1232 #if U_IS_BIG_ENDIAN
   1233     { 0, 0, 0xff, 0xfd }, 4,
   1234 #else
   1235     { 0xfd, 0xff, 0, 0 }, 4,
   1236 #endif
   1237     FALSE, FALSE,
   1238     0,
   1239     0,
   1240     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1241 };
   1242 
   1243 const UConverterSharedData _UTF32Data = {
   1244     sizeof(UConverterSharedData), ~((uint32_t) 0),
   1245     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
   1246     0
   1247 };
   1248 
   1249 #endif
   1250