Home | History | Annotate | Download | only in common
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2002-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 *   file name:  ucnv_u32.c
      7 *   encoding:   US-ASCII
      8 *   tab size:   8 (not used)
      9 *   indentation:4
     10 *
     11 *   created on: 2002jul01
     12 *   created by: Markus W. Scherer
     13 *
     14 *   UTF-32 converter implementation. Used to be in ucnv_utf.c.
     15 */
     16 
     17 #include "unicode/utypes.h"
     18 
     19 #if !UCONFIG_NO_CONVERSION
     20 
     21 #include "unicode/ucnv.h"
     22 #include "ucnv_bld.h"
     23 #include "ucnv_cnv.h"
     24 #include "cmemory.h"
     25 
     26 #define MAXIMUM_UCS2            0x0000FFFF
     27 #define MAXIMUM_UTF             0x0010FFFF
     28 #define HALF_SHIFT              10
     29 #define HALF_BASE               0x0010000
     30 #define HALF_MASK               0x3FF
     31 #define SURROGATE_HIGH_START    0xD800
     32 #define SURROGATE_LOW_START     0xDC00
     33 
     34 /* -SURROGATE_LOW_START + HALF_BASE */
     35 #define SURROGATE_LOW_BASE      9216
     36 
     37 enum {
     38     UCNV_NEED_TO_WRITE_BOM=1
     39 };
     40 
     41 /* UTF-32BE ----------------------------------------------------------------- */
     42 
     43 static void
     44 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args,
     45                                 UErrorCode * err)
     46 {
     47     const unsigned char *mySource = (unsigned char *) args->source;
     48     UChar *myTarget = args->target;
     49     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
     50     const UChar *targetLimit = args->targetLimit;
     51     unsigned char *toUBytes = args->converter->toUBytes;
     52     uint32_t ch, i;
     53 
     54     /* Restore state of current sequence */
     55     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
     56         i = args->converter->toULength;       /* restore # of bytes consumed */
     57         args->converter->toULength = 0;
     58 
     59         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
     60         args->converter->toUnicodeStatus = 0;
     61         goto morebytes;
     62     }
     63 
     64     while (mySource < sourceLimit && myTarget < targetLimit) {
     65         i = 0;
     66         ch = 0;
     67 morebytes:
     68         while (i < sizeof(uint32_t)) {
     69             if (mySource < sourceLimit) {
     70                 ch = (ch << 8) | (uint8_t)(*mySource);
     71                 toUBytes[i++] = (char) *(mySource++);
     72             }
     73             else {
     74                 /* stores a partially calculated target*/
     75                 /* + 1 to make 0 a valid character */
     76                 args->converter->toUnicodeStatus = ch + 1;
     77                 args->converter->toULength = (int8_t) i;
     78                 goto donefornow;
     79             }
     80         }
     81 
     82         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
     83             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
     84             if (ch <= MAXIMUM_UCS2)
     85             {
     86                 /* fits in 16 bits */
     87                 *(myTarget++) = (UChar) ch;
     88             }
     89             else {
     90                 /* write out the surrogates */
     91                 *(myTarget++) = U16_LEAD(ch);
     92                 ch = U16_TRAIL(ch);
     93                 if (myTarget < targetLimit) {
     94                     *(myTarget++) = (UChar)ch;
     95                 }
     96                 else {
     97                     /* Put in overflow buffer (not handled here) */
     98                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
     99                     args->converter->UCharErrorBufferLength = 1;
    100                     *err = U_BUFFER_OVERFLOW_ERROR;
    101                     break;
    102                 }
    103             }
    104         }
    105         else {
    106             args->converter->toULength = (int8_t)i;
    107             *err = U_ILLEGAL_CHAR_FOUND;
    108             break;
    109         }
    110     }
    111 
    112 donefornow:
    113     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    114         /* End of target buffer */
    115         *err = U_BUFFER_OVERFLOW_ERROR;
    116     }
    117 
    118     args->target = myTarget;
    119     args->source = (const char *) mySource;
    120 }
    121 
    122 static void
    123 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    124                                              UErrorCode * err)
    125 {
    126     const unsigned char *mySource = (unsigned char *) args->source;
    127     UChar *myTarget = args->target;
    128     int32_t *myOffsets = args->offsets;
    129     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    130     const UChar *targetLimit = args->targetLimit;
    131     unsigned char *toUBytes = args->converter->toUBytes;
    132     uint32_t ch, i;
    133     int32_t offsetNum = 0;
    134 
    135     /* Restore state of current sequence */
    136     if (args->converter->toUnicodeStatus && myTarget < targetLimit) {
    137         i = args->converter->toULength;       /* restore # of bytes consumed */
    138         args->converter->toULength = 0;
    139 
    140         ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/
    141         args->converter->toUnicodeStatus = 0;
    142         goto morebytes;
    143     }
    144 
    145     while (mySource < sourceLimit && myTarget < targetLimit) {
    146         i = 0;
    147         ch = 0;
    148 morebytes:
    149         while (i < sizeof(uint32_t)) {
    150             if (mySource < sourceLimit) {
    151                 ch = (ch << 8) | (uint8_t)(*mySource);
    152                 toUBytes[i++] = (char) *(mySource++);
    153             }
    154             else {
    155                 /* stores a partially calculated target*/
    156                 /* + 1 to make 0 a valid character */
    157                 args->converter->toUnicodeStatus = ch + 1;
    158                 args->converter->toULength = (int8_t) i;
    159                 goto donefornow;
    160             }
    161         }
    162 
    163         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    164             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    165             if (ch <= MAXIMUM_UCS2) {
    166                 /* fits in 16 bits */
    167                 *(myTarget++) = (UChar) ch;
    168                 *(myOffsets++) = offsetNum;
    169             }
    170             else {
    171                 /* write out the surrogates */
    172                 *(myTarget++) = U16_LEAD(ch);
    173                 *myOffsets++ = offsetNum;
    174                 ch = U16_TRAIL(ch);
    175                 if (myTarget < targetLimit)
    176                 {
    177                     *(myTarget++) = (UChar)ch;
    178                     *(myOffsets++) = offsetNum;
    179                 }
    180                 else {
    181                     /* Put in overflow buffer (not handled here) */
    182                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    183                     args->converter->UCharErrorBufferLength = 1;
    184                     *err = U_BUFFER_OVERFLOW_ERROR;
    185                     break;
    186                 }
    187             }
    188         }
    189         else {
    190             args->converter->toULength = (int8_t)i;
    191             *err = U_ILLEGAL_CHAR_FOUND;
    192             break;
    193         }
    194         offsetNum += i;
    195     }
    196 
    197 donefornow:
    198     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    199     {
    200         /* End of target buffer */
    201         *err = U_BUFFER_OVERFLOW_ERROR;
    202     }
    203 
    204     args->target = myTarget;
    205     args->source = (const char *) mySource;
    206     args->offsets = myOffsets;
    207 }
    208 
    209 static void
    210 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args,
    211                                   UErrorCode * err)
    212 {
    213     const UChar *mySource = args->source;
    214     unsigned char *myTarget;
    215     const UChar *sourceLimit = args->sourceLimit;
    216     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    217     UChar32 ch, ch2;
    218     unsigned int indexToWrite;
    219     unsigned char temp[sizeof(uint32_t)];
    220 
    221     if(mySource >= sourceLimit) {
    222         /* no input, nothing to do */
    223         return;
    224     }
    225 
    226     /* write the BOM if necessary */
    227     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    228         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    229         ucnv_fromUWriteBytes(args->converter,
    230                              bom, 4,
    231                              &args->target, args->targetLimit,
    232                              &args->offsets, -1,
    233                              err);
    234         args->converter->fromUnicodeStatus=0;
    235     }
    236 
    237     myTarget = (unsigned char *) args->target;
    238     temp[0] = 0;
    239 
    240     if (args->converter->fromUChar32) {
    241         ch = args->converter->fromUChar32;
    242         args->converter->fromUChar32 = 0;
    243         goto lowsurogate;
    244     }
    245 
    246     while (mySource < sourceLimit && myTarget < targetLimit) {
    247         ch = *(mySource++);
    248 
    249         if (UTF_IS_SURROGATE(ch)) {
    250             if (U_IS_LEAD(ch)) {
    251 lowsurogate:
    252                 if (mySource < sourceLimit) {
    253                     ch2 = *mySource;
    254                     if (U_IS_TRAIL(ch2)) {
    255                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    256                         mySource++;
    257                     }
    258                     else {
    259                         /* this is an unmatched trail code unit (2nd surrogate) */
    260                         /* callback(illegal) */
    261                         args->converter->fromUChar32 = ch;
    262                         *err = U_ILLEGAL_CHAR_FOUND;
    263                         break;
    264                     }
    265                 }
    266                 else {
    267                     /* ran out of source */
    268                     args->converter->fromUChar32 = ch;
    269                     if (args->flush) {
    270                         /* this is an unmatched trail code unit (2nd surrogate) */
    271                         /* callback(illegal) */
    272                         *err = U_ILLEGAL_CHAR_FOUND;
    273                     }
    274                     break;
    275                 }
    276             }
    277             else {
    278                 /* this is an unmatched trail code unit (2nd surrogate) */
    279                 /* callback(illegal) */
    280                 args->converter->fromUChar32 = ch;
    281                 *err = U_ILLEGAL_CHAR_FOUND;
    282                 break;
    283             }
    284         }
    285 
    286         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    287         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    288         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    289         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    290 
    291         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    292             if (myTarget < targetLimit) {
    293                 *(myTarget++) = temp[indexToWrite];
    294             }
    295             else {
    296                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    297                 *err = U_BUFFER_OVERFLOW_ERROR;
    298             }
    299         }
    300     }
    301 
    302     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    303         *err = U_BUFFER_OVERFLOW_ERROR;
    304     }
    305 
    306     args->target = (char *) myTarget;
    307     args->source = mySource;
    308 }
    309 
    310 static void
    311 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
    312                                                UErrorCode * err)
    313 {
    314     const UChar *mySource = args->source;
    315     unsigned char *myTarget;
    316     int32_t *myOffsets;
    317     const UChar *sourceLimit = args->sourceLimit;
    318     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    319     UChar32 ch, ch2;
    320     int32_t offsetNum = 0;
    321     unsigned int indexToWrite;
    322     unsigned char temp[sizeof(uint32_t)];
    323 
    324     if(mySource >= sourceLimit) {
    325         /* no input, nothing to do */
    326         return;
    327     }
    328 
    329     /* write the BOM if necessary */
    330     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    331         static const char bom[]={ 0, 0, (char)0xfe, (char)0xff };
    332         ucnv_fromUWriteBytes(args->converter,
    333                              bom, 4,
    334                              &args->target, args->targetLimit,
    335                              &args->offsets, -1,
    336                              err);
    337         args->converter->fromUnicodeStatus=0;
    338     }
    339 
    340     myTarget = (unsigned char *) args->target;
    341     myOffsets = args->offsets;
    342     temp[0] = 0;
    343 
    344     if (args->converter->fromUChar32) {
    345         ch = args->converter->fromUChar32;
    346         args->converter->fromUChar32 = 0;
    347         goto lowsurogate;
    348     }
    349 
    350     while (mySource < sourceLimit && myTarget < targetLimit) {
    351         ch = *(mySource++);
    352 
    353         if (UTF_IS_SURROGATE(ch)) {
    354             if (U_IS_LEAD(ch)) {
    355 lowsurogate:
    356                 if (mySource < sourceLimit) {
    357                     ch2 = *mySource;
    358                     if (U_IS_TRAIL(ch2)) {
    359                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    360                         mySource++;
    361                     }
    362                     else {
    363                         /* this is an unmatched trail code unit (2nd surrogate) */
    364                         /* callback(illegal) */
    365                         args->converter->fromUChar32 = ch;
    366                         *err = U_ILLEGAL_CHAR_FOUND;
    367                         break;
    368                     }
    369                 }
    370                 else {
    371                     /* ran out of source */
    372                     args->converter->fromUChar32 = ch;
    373                     if (args->flush) {
    374                         /* this is an unmatched trail code unit (2nd surrogate) */
    375                         /* callback(illegal) */
    376                         *err = U_ILLEGAL_CHAR_FOUND;
    377                     }
    378                     break;
    379                 }
    380             }
    381             else {
    382                 /* this is an unmatched trail code unit (2nd surrogate) */
    383                 /* callback(illegal) */
    384                 args->converter->fromUChar32 = ch;
    385                 *err = U_ILLEGAL_CHAR_FOUND;
    386                 break;
    387             }
    388         }
    389 
    390         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    391         temp[1] = (uint8_t) (ch >> 16 & 0x1F);
    392         temp[2] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    393         temp[3] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    394 
    395         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) {
    396             if (myTarget < targetLimit) {
    397                 *(myTarget++) = temp[indexToWrite];
    398                 *(myOffsets++) = offsetNum;
    399             }
    400             else {
    401                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    402                 *err = U_BUFFER_OVERFLOW_ERROR;
    403             }
    404         }
    405         offsetNum = offsetNum + 1 + (temp[1] != 0);
    406     }
    407 
    408     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) {
    409         *err = U_BUFFER_OVERFLOW_ERROR;
    410     }
    411 
    412     args->target = (char *) myTarget;
    413     args->source = mySource;
    414     args->offsets = myOffsets;
    415 }
    416 
    417 static UChar32
    418 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args,
    419                                    UErrorCode* err)
    420 {
    421     const uint8_t *mySource;
    422     UChar32 myUChar;
    423     int32_t length;
    424 
    425     mySource = (const uint8_t *)args->source;
    426     if (mySource >= (const uint8_t *)args->sourceLimit)
    427     {
    428         /* no input */
    429         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    430         return 0xffff;
    431     }
    432 
    433     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    434     if (length < 4)
    435     {
    436         /* got a partial character */
    437         uprv_memcpy(args->converter->toUBytes, mySource, length);
    438         args->converter->toULength = (int8_t)length;
    439         args->source = (const char *)(mySource + length);
    440         *err = U_TRUNCATED_CHAR_FOUND;
    441         return 0xffff;
    442     }
    443 
    444     /* Don't even try to do a direct cast because the value may be on an odd address. */
    445     myUChar = ((UChar32)mySource[0] << 24)
    446             | ((UChar32)mySource[1] << 16)
    447             | ((UChar32)mySource[2] << 8)
    448             | ((UChar32)mySource[3]);
    449 
    450     args->source = (const char *)(mySource + 4);
    451     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    452         return myUChar;
    453     }
    454 
    455     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    456     args->converter->toULength = 4;
    457 
    458     *err = U_ILLEGAL_CHAR_FOUND;
    459     return 0xffff;
    460 }
    461 
    462 static const UConverterImpl _UTF32BEImpl = {
    463     UCNV_UTF32_BigEndian,
    464 
    465     NULL,
    466     NULL,
    467 
    468     NULL,
    469     NULL,
    470     NULL,
    471 
    472     T_UConverter_toUnicode_UTF32_BE,
    473     T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC,
    474     T_UConverter_fromUnicode_UTF32_BE,
    475     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
    476     T_UConverter_getNextUChar_UTF32_BE,
    477 
    478     NULL,
    479     NULL,
    480     NULL,
    481     NULL,
    482     ucnv_getNonSurrogateUnicodeSet
    483 };
    484 
    485 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */
    486 static const UConverterStaticData _UTF32BEStaticData = {
    487     sizeof(UConverterStaticData),
    488     "UTF-32BE",
    489     1232,
    490     UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4,
    491     { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE,
    492     0,
    493     0,
    494     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    495 };
    496 
    497 const UConverterSharedData _UTF32BEData = {
    498     sizeof(UConverterSharedData), ~((uint32_t) 0),
    499     NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl,
    500     0
    501 };
    502 
    503 /* UTF-32LE ---------------------------------------------------------- */
    504 
    505 static void
    506 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args,
    507                                 UErrorCode * err)
    508 {
    509     const unsigned char *mySource = (unsigned char *) args->source;
    510     UChar *myTarget = args->target;
    511     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    512     const UChar *targetLimit = args->targetLimit;
    513     unsigned char *toUBytes = args->converter->toUBytes;
    514     uint32_t ch, i;
    515 
    516     /* Restore state of current sequence */
    517     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    518     {
    519         i = args->converter->toULength;       /* restore # of bytes consumed */
    520         args->converter->toULength = 0;
    521 
    522         /* Stores the previously calculated ch from a previous call*/
    523         ch = args->converter->toUnicodeStatus - 1;
    524         args->converter->toUnicodeStatus = 0;
    525         goto morebytes;
    526     }
    527 
    528     while (mySource < sourceLimit && myTarget < targetLimit)
    529     {
    530         i = 0;
    531         ch = 0;
    532 morebytes:
    533         while (i < sizeof(uint32_t))
    534         {
    535             if (mySource < sourceLimit)
    536             {
    537                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    538                 toUBytes[i++] = (char) *(mySource++);
    539             }
    540             else
    541             {
    542                 /* stores a partially calculated target*/
    543                 /* + 1 to make 0 a valid character */
    544                 args->converter->toUnicodeStatus = ch + 1;
    545                 args->converter->toULength = (int8_t) i;
    546                 goto donefornow;
    547             }
    548         }
    549 
    550         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) {
    551             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    552             if (ch <= MAXIMUM_UCS2) {
    553                 /* fits in 16 bits */
    554                 *(myTarget++) = (UChar) ch;
    555             }
    556             else {
    557                 /* write out the surrogates */
    558                 *(myTarget++) = U16_LEAD(ch);
    559                 ch = U16_TRAIL(ch);
    560                 if (myTarget < targetLimit) {
    561                     *(myTarget++) = (UChar)ch;
    562                 }
    563                 else {
    564                     /* Put in overflow buffer (not handled here) */
    565                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    566                     args->converter->UCharErrorBufferLength = 1;
    567                     *err = U_BUFFER_OVERFLOW_ERROR;
    568                     break;
    569                 }
    570             }
    571         }
    572         else {
    573             args->converter->toULength = (int8_t)i;
    574             *err = U_ILLEGAL_CHAR_FOUND;
    575             break;
    576         }
    577     }
    578 
    579 donefornow:
    580     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    581     {
    582         /* End of target buffer */
    583         *err = U_BUFFER_OVERFLOW_ERROR;
    584     }
    585 
    586     args->target = myTarget;
    587     args->source = (const char *) mySource;
    588 }
    589 
    590 static void
    591 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args,
    592                                              UErrorCode * err)
    593 {
    594     const unsigned char *mySource = (unsigned char *) args->source;
    595     UChar *myTarget = args->target;
    596     int32_t *myOffsets = args->offsets;
    597     const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit;
    598     const UChar *targetLimit = args->targetLimit;
    599     unsigned char *toUBytes = args->converter->toUBytes;
    600     uint32_t ch, i;
    601     int32_t offsetNum = 0;
    602 
    603     /* Restore state of current sequence */
    604     if (args->converter->toUnicodeStatus && myTarget < targetLimit)
    605     {
    606         i = args->converter->toULength;       /* restore # of bytes consumed */
    607         args->converter->toULength = 0;
    608 
    609         /* Stores the previously calculated ch from a previous call*/
    610         ch = args->converter->toUnicodeStatus - 1;
    611         args->converter->toUnicodeStatus = 0;
    612         goto morebytes;
    613     }
    614 
    615     while (mySource < sourceLimit && myTarget < targetLimit)
    616     {
    617         i = 0;
    618         ch = 0;
    619 morebytes:
    620         while (i < sizeof(uint32_t))
    621         {
    622             if (mySource < sourceLimit)
    623             {
    624                 ch |= ((uint8_t)(*mySource)) << (i * 8);
    625                 toUBytes[i++] = (char) *(mySource++);
    626             }
    627             else
    628             {
    629                 /* stores a partially calculated target*/
    630                 /* + 1 to make 0 a valid character */
    631                 args->converter->toUnicodeStatus = ch + 1;
    632                 args->converter->toULength = (int8_t) i;
    633                 goto donefornow;
    634             }
    635         }
    636 
    637         if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch))
    638         {
    639             /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */
    640             if (ch <= MAXIMUM_UCS2)
    641             {
    642                 /* fits in 16 bits */
    643                 *(myTarget++) = (UChar) ch;
    644                 *(myOffsets++) = offsetNum;
    645             }
    646             else {
    647                 /* write out the surrogates */
    648                 *(myTarget++) = U16_LEAD(ch);
    649                 *(myOffsets++) = offsetNum;
    650                 ch = U16_TRAIL(ch);
    651                 if (myTarget < targetLimit)
    652                 {
    653                     *(myTarget++) = (UChar)ch;
    654                     *(myOffsets++) = offsetNum;
    655                 }
    656                 else
    657                 {
    658                     /* Put in overflow buffer (not handled here) */
    659                     args->converter->UCharErrorBuffer[0] = (UChar) ch;
    660                     args->converter->UCharErrorBufferLength = 1;
    661                     *err = U_BUFFER_OVERFLOW_ERROR;
    662                     break;
    663                 }
    664             }
    665         }
    666         else
    667         {
    668             args->converter->toULength = (int8_t)i;
    669             *err = U_ILLEGAL_CHAR_FOUND;
    670             break;
    671         }
    672         offsetNum += i;
    673     }
    674 
    675 donefornow:
    676     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    677     {
    678         /* End of target buffer */
    679         *err = U_BUFFER_OVERFLOW_ERROR;
    680     }
    681 
    682     args->target = myTarget;
    683     args->source = (const char *) mySource;
    684     args->offsets = myOffsets;
    685 }
    686 
    687 static void
    688 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args,
    689                                   UErrorCode * err)
    690 {
    691     const UChar *mySource = args->source;
    692     unsigned char *myTarget;
    693     const UChar *sourceLimit = args->sourceLimit;
    694     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
    695     UChar32 ch, ch2;
    696     unsigned int indexToWrite;
    697     unsigned char temp[sizeof(uint32_t)];
    698 
    699     if(mySource >= sourceLimit) {
    700         /* no input, nothing to do */
    701         return;
    702     }
    703 
    704     /* write the BOM if necessary */
    705     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
    706         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
    707         ucnv_fromUWriteBytes(args->converter,
    708                              bom, 4,
    709                              &args->target, args->targetLimit,
    710                              &args->offsets, -1,
    711                              err);
    712         args->converter->fromUnicodeStatus=0;
    713     }
    714 
    715     myTarget = (unsigned char *) args->target;
    716     temp[3] = 0;
    717 
    718     if (args->converter->fromUChar32)
    719     {
    720         ch = args->converter->fromUChar32;
    721         args->converter->fromUChar32 = 0;
    722         goto lowsurogate;
    723     }
    724 
    725     while (mySource < sourceLimit && myTarget < targetLimit)
    726     {
    727         ch = *(mySource++);
    728 
    729         if (UTF_IS_SURROGATE(ch)) {
    730             if (U_IS_LEAD(ch))
    731             {
    732 lowsurogate:
    733                 if (mySource < sourceLimit)
    734                 {
    735                     ch2 = *mySource;
    736                     if (U_IS_TRAIL(ch2)) {
    737                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
    738                         mySource++;
    739                     }
    740                     else {
    741                         /* this is an unmatched trail code unit (2nd surrogate) */
    742                         /* callback(illegal) */
    743                         args->converter->fromUChar32 = ch;
    744                         *err = U_ILLEGAL_CHAR_FOUND;
    745                         break;
    746                     }
    747                 }
    748                 else {
    749                     /* ran out of source */
    750                     args->converter->fromUChar32 = ch;
    751                     if (args->flush) {
    752                         /* this is an unmatched trail code unit (2nd surrogate) */
    753                         /* callback(illegal) */
    754                         *err = U_ILLEGAL_CHAR_FOUND;
    755                     }
    756                     break;
    757                 }
    758             }
    759             else {
    760                 /* this is an unmatched trail code unit (2nd surrogate) */
    761                 /* callback(illegal) */
    762                 args->converter->fromUChar32 = ch;
    763                 *err = U_ILLEGAL_CHAR_FOUND;
    764                 break;
    765             }
    766         }
    767 
    768         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    769         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
    770         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
    771         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
    772 
    773         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
    774         {
    775             if (myTarget < targetLimit)
    776             {
    777                 *(myTarget++) = temp[indexToWrite];
    778             }
    779             else
    780             {
    781                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
    782                 *err = U_BUFFER_OVERFLOW_ERROR;
    783             }
    784         }
    785     }
    786 
    787     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
    788     {
    789         *err = U_BUFFER_OVERFLOW_ERROR;
    790     }
    791 
    792     args->target = (char *) myTarget;
    793     args->source = mySource;
    794 }
    795 
     796 static void
     797 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args,
     798                                                UErrorCode * err)
     799 {
             /*
              * Converts UTF-16 code units from args->source into UTF-32LE bytes at
              * args->target and records one source offset per output byte in
              * args->offsets.
              *
              * - If converter->fromUnicodeStatus is UCNV_NEED_TO_WRITE_BOM, the bytes
              *   FF FE 00 00 are written first (with offset -1) and the status is cleared.
              * - A code unit left in converter->fromUChar32 by an earlier call is picked
              *   up again and paired with the first unit of this buffer.
              * - An unpaired surrogate is stored in converter->fromUChar32 and reported
              *   as U_ILLEGAL_CHAR_FOUND; a lead surrogate at the very end of the input
              *   is only reported as an error when args->flush is set.
              * - Bytes that do not fit into the target are appended to
              *   converter->charErrorBuffer and *err becomes U_BUFFER_OVERFLOW_ERROR.
              *
              * args->offsets is written without a NULL check (presumably this variant is
              * only dispatched when offsets were requested - confirm against the caller).
              * Except for the early "no input" return, args->source, args->target and
              * args->offsets are updated to the positions reached.
              */
     800     const UChar *mySource = args->source;
     801     unsigned char *myTarget;
     802     int32_t *myOffsets;
     803     const UChar *sourceLimit = args->sourceLimit;
     804     const unsigned char *targetLimit = (unsigned char *) args->targetLimit;
     805     UChar32 ch, ch2;
     806     unsigned int indexToWrite;
     807     unsigned char temp[sizeof(uint32_t)];
     808     int32_t offsetNum = 0;
     809 
     810     if(mySource >= sourceLimit) {
     811         /* no input, nothing to do */
     812         return;
     813     }
     814 
     815     /* write the BOM if necessary */
     816     if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) {
     817         static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 };
     818         ucnv_fromUWriteBytes(args->converter,
     819                              bom, 4,
     820                              &args->target, args->targetLimit,
     821                              &args->offsets, -1,
     822                              err);
     823         args->converter->fromUnicodeStatus=0;
     824     }
     825 
                 /* read target/offsets only now: the BOM write above may have advanced them */
     826     myTarget = (unsigned char *) args->target;
     827     myOffsets = args->offsets;
                 /* the most significant byte stays 0; the loop below only fills temp[0..2] */
     828     temp[3] = 0;
     829 
     830     if (args->converter->fromUChar32)
     831     {
                     /* resume inside the loop with the code unit saved by an earlier call */
     832         ch = args->converter->fromUChar32;
     833         args->converter->fromUChar32 = 0;
     834         goto lowsurogate;
     835     }
     836 
     837     while (mySource < sourceLimit && myTarget < targetLimit)
     838     {
     839         ch = *(mySource++);
     840 
     841         if (UTF_IS_SURROGATE(ch)) {
     842             if (U_IS_LEAD(ch))
     843             {
     844 lowsurogate:
     845                 if (mySource < sourceLimit)
     846                 {
                                 /* peek only; the unit is consumed below if it completes the pair */
     847                     ch2 = *mySource;
     848                     if (U_IS_TRAIL(ch2))
     849                     {
     850                         ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE;
     851                         mySource++;
     852                     }
     853                     else {
     854                         /* this is an unmatched lead code unit (1st surrogate) */
     855                         /* callback(illegal) */
     856                         args->converter->fromUChar32 = ch;
     857                         *err = U_ILLEGAL_CHAR_FOUND;
     858                         break;
     859                     }
     860                 }
     861                 else {
     862                     /* ran out of source */
     863                     args->converter->fromUChar32 = ch;
     864                     if (args->flush) {
     865                         /* this is an unmatched lead code unit (1st surrogate) */
     866                         /* callback(illegal) */
     867                         *err = U_ILLEGAL_CHAR_FOUND;
     868                     }
     869                     break;
     870                 }
     871             }
     872             else {
     873                 /* this is an unmatched trail code unit (2nd surrogate) */
     874                 /* callback(illegal) */
     875                 args->converter->fromUChar32 = ch;
     876                 *err = U_ILLEGAL_CHAR_FOUND;
     877                 break;
     878             }
     879         }
     880 
     881         /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
     882         temp[2] = (uint8_t) (ch >> 16 & 0x1F);
     883         temp[1] = (uint8_t) (ch >> 8);  /* unsigned cast implicitly does (ch & FF) */
     884         temp[0] = (uint8_t) (ch);       /* unsigned cast implicitly does (ch & FF) */
     885 
                     /* emit all four bytes, least significant first; overflow goes to charErrorBuffer */
     886         for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++)
     887         {
     888             if (myTarget < targetLimit)
     889             {
     890                 *(myTarget++) = temp[indexToWrite];
     891                 *(myOffsets++) = offsetNum;
     892             }
     893             else
     894             {
     895                 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite];
     896                 *err = U_BUFFER_OVERFLOW_ERROR;
     897             }
     898         }
                     /* advance by one UTF-16 unit, or by two when temp[2] != 0 (ch >= 0x10000, a surrogate pair) */
     899         offsetNum = offsetNum + 1 + (temp[2] != 0);
     900     }
     901 
                 /* input left over but no room and no other error yet: report the overflow */
     902     if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err))
     903     {
     904         *err = U_BUFFER_OVERFLOW_ERROR;
     905     }
     906 
     907     args->target = (char *) myTarget;
     908     args->source = mySource;
     909     args->offsets = myOffsets;
     910 }
    911 
    912 static UChar32
    913 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args,
    914                                    UErrorCode* err)
    915 {
    916     const uint8_t *mySource;
    917     UChar32 myUChar;
    918     int32_t length;
    919 
    920     mySource = (const uint8_t *)args->source;
    921     if (mySource >= (const uint8_t *)args->sourceLimit)
    922     {
    923         /* no input */
    924         *err = U_INDEX_OUTOFBOUNDS_ERROR;
    925         return 0xffff;
    926     }
    927 
    928     length = (int32_t)((const uint8_t *)args->sourceLimit - mySource);
    929     if (length < 4)
    930     {
    931         /* got a partial character */
    932         uprv_memcpy(args->converter->toUBytes, mySource, length);
    933         args->converter->toULength = (int8_t)length;
    934         args->source = (const char *)(mySource + length);
    935         *err = U_TRUNCATED_CHAR_FOUND;
    936         return 0xffff;
    937     }
    938 
    939     /* Don't even try to do a direct cast because the value may be on an odd address. */
    940     myUChar = ((UChar32)mySource[3] << 24)
    941             | ((UChar32)mySource[2] << 16)
    942             | ((UChar32)mySource[1] << 8)
    943             | ((UChar32)mySource[0]);
    944 
    945     args->source = (const char *)(mySource + 4);
    946     if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) {
    947         return myUChar;
    948     }
    949 
    950     uprv_memcpy(args->converter->toUBytes, mySource, 4);
    951     args->converter->toULength = 4;
    952 
    953     *err = U_ILLEGAL_CHAR_FOUND;
    954     return 0xffff;
    955 }
    956 
     957 static const UConverterImpl _UTF32LEImpl = {
             /*
              * Function table for the UTF-32LE converter. Only the five conversion
              * functions and the final set-getter are provided; every other slot is NULL.
              * NOTE(review): slot meanings follow the field order of UConverterImpl,
              * which is declared outside this file - confirm against its definition.
              */
     958     UCNV_UTF32_LittleEndian,
     959 
     960     NULL,
     961     NULL,
     962 
     963     NULL,
     964     NULL,
     965     NULL,
     966 
             /* to-Unicode and from-Unicode, each without and with offset tracking */
     967     T_UConverter_toUnicode_UTF32_LE,
     968     T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC,
     969     T_UConverter_fromUnicode_UTF32_LE,
     970     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
     971     T_UConverter_getNextUChar_UTF32_LE,
     972 
     973     NULL,
     974     NULL,
     975     NULL,
     976     NULL,
     977     ucnv_getNonSurrogateUnicodeSet
     978 };
    979 
     980 /* The 1234 CCSID refers to any version of Unicode encoded as little-endian UTF-32 (NOTE(review): confirm the number against the IBM CCSID registry) */
     981 static const UConverterStaticData _UTF32LEStaticData = {
     982     sizeof(UConverterStaticData),
     983     "UTF-32LE",
     984     1234,
             /* presumably platform, converter type, min and max bytes per character - confirm against UConverterStaticData */
     985     UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4,
             /* the 4 bytes FD FF 00 00 are U+FFFD in little-endian order (presumably the substitution character) */
     986     { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE,
     987     0,
     988     0,
     989     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
     990 };
    991 
    992 
     993 const UConverterSharedData _UTF32LEData = {
             /*
              * Externally visible shared-data record for UTF-32LE; it ties together
              * _UTF32LEStaticData and the _UTF32LEImpl function table.
              * NOTE(review): the remaining fields follow UConverterSharedData's
              * declaration order, which is not visible in this file.
              */
     994     sizeof(UConverterSharedData), ~((uint32_t) 0),
     995     NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl,
     996     0
     997 };
    998 
    999 /* UTF-32 (Detect BOM) ------------------------------------------------------ */
   1000 
   1001 /*
   1002  * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE
   1003  * accordingly.
   1004  *
   1005  * State values:
   1006  * 0    initial state
   1007  * 1    saw 00
   1008  * 2    saw 00 00
   1009  * 3    saw 00 00 FE
   1010  * 4    -
   1011  * 5    saw FF
   1012  * 6    saw FF FE
   1013  * 7    saw FF FE 00
   1014  * 8    UTF-32BE mode
   1015  * 9    UTF-32LE mode
   1016  *
   1017  * During detection: state&3==number of matching bytes so far.
   1018  *
   1019  * On output, emit U+FEFF as the first code point.
   1020  */
   1021 
    1022 static void
    1023 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) {
             /*
              * Resets the BOM-detecting UTF-32 converter. The two directions are reset
              * independently, depending on how choice compares with
              * UCNV_RESET_TO_UNICODE: the toUnicode side returns to detection state 0,
              * the fromUnicode side is armed to emit a BOM before the next output.
              */
    1024     if(choice<=UCNV_RESET_TO_UNICODE) {
    1025         /* reset toUnicode: state=0 */
    1026         cnv->mode=0;
    1027     }
    1028     if(choice!=UCNV_RESET_TO_UNICODE) {
    1029         /* reset fromUnicode: prepare to output the UTF-32PE BOM */
    1030         cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM;
    1031     }
    1032 }
   1033 
    1034 static void
    1035 _UTF32Open(UConverter *cnv,
    1036            UConverterLoadArgs *pArgs,
    1037            UErrorCode *pErrorCode) {
             /*
              * Open hook for the UTF-32 converter: puts both directions into their
              * initial state. pArgs and pErrorCode are not used here.
              */
    1038     _UTF32Reset(cnv, UCNV_RESET_BOTH);
    1039 }
   1040 
    1041 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff,    (char)0xff, (char)0xfe, 0, 0 };
         /*
          * utf32BOM layout: bytes 0..3 are the UTF-32BE BOM (00 00 FE FF), bytes 4..7
          * the UTF-32LE BOM (FF FE 00 00). The detection code indexes it directly with
          * its state number and uses (state&4) to pick the BOM that was being matched.
          */
   1042 
    1043 static void
    1044 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    1045                            UErrorCode *pErrorCode) {
             /*
              * toUnicode function of the BOM-detecting UTF-32 converter.
              *
              * cnv->mode holds the detection state (see the state table earlier in this
              * file) and is written back at the end of this function.
              *
              * - States 0..7 consume input bytes for as long as they match one of the
              *   two BOMs in utf32BOM. A complete match selects state 8 (UTF-32BE) or
              *   state 9 (UTF-32LE).
              * - On a mismatch the converter falls back to UTF-32BE and re-reads the
              *   bytes it had tentatively consumed; bytes that came from an earlier
              *   buffer are replayed out of utf32BOM.
              * - States 8 and 9 delegate to the UTF-32BE / UTF-32LE functions, using the
              *   offset-tracking variants when pArgs->offsets is not NULL.
              * - When all input is consumed and pArgs->flush is set, the delegate is
              *   called once more so that truncated input gets reported.
              */
    1046     UConverter *cnv=pArgs->converter;
    1047     const char *source=pArgs->source;
    1048     const char *sourceLimit=pArgs->sourceLimit;
             /* start of the offsets array; used after the loop to patch what the delegate wrote */
    1049     int32_t *offsets=pArgs->offsets;
    1050 
    1051     int32_t state, offsetDelta;
    1052     char b;
    1053 
    1054     state=cnv->mode;
    1055 
    1056     /*
    1057      * If we detect a BOM in this buffer, then we must add the BOM size to the
    1058      * offsets because the actual converter function will not see and count the BOM.
    1059      * offsetDelta will have the number of the BOM bytes that are in the current buffer.
    1060      */
    1061     offsetDelta=0;
    1062 
    1063     while(source<sourceLimit && U_SUCCESS(*pErrorCode)) {
    1064         switch(state) {
    1065         case 0:
    1066             b=*source;
    1067             if(b==0) {
    1068                 state=1; /* could be 00 00 FE FF */
    1069             } else if(b==(char)0xff) {
    1070                 state=5; /* could be FF FE 00 00 */
    1071             } else {
    1072                 state=8; /* default to UTF-32BE */
                         /* re-dispatch in state 8 without consuming this byte */
    1073                 continue;
    1074             }
    1075             ++source;
    1076             break;
    1077         case 1:
    1078         case 2:
    1079         case 3:
    1080         case 5:
    1081         case 6:
    1082         case 7:
                     /* the state number doubles as the index of the next expected BOM byte */
    1083             if(*source==utf32BOM[state]) {
    1084                 ++state;
    1085                 ++source;
    1086                 if(state==4) {
    1087                     state=8; /* detect UTF-32BE */
    1088                     offsetDelta=(int32_t)(source-pArgs->source);
    1089                 } else if(state==8) {
    1090                     state=9; /* detect UTF-32LE */
    1091                     offsetDelta=(int32_t)(source-pArgs->source);
    1092                 }
    1093             } else {
    1094                 /* switch to UTF-32BE and pass the previous bytes */
    1095                 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */
    1096 
    1097                 /* reset the source */
    1098                 source=pArgs->source;
    1099 
                         /* state&3 is the number of BOM bytes matched so far (see the state table) */
    1100                 if(count==(state&3)) {
    1101                     /* simple: all in the same buffer, just reset source */
    1102                 } else {
    1103                     UBool oldFlush=pArgs->flush;
    1104 
    1105                     /* some of the bytes are from a previous buffer, replay those first */
    1106                     pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
    1107                     pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */
    1108                     pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */
    1109 
    1110                     /* no offsets: bytes from previous buffer, and not enough for output */
    1111                     T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
    1112 
    1113                     /* restore real pointers; pArgs->source will be set in case 8/9 */
    1114                     pArgs->sourceLimit=sourceLimit;
    1115                     pArgs->flush=oldFlush;
    1116                 }
    1117                 state=8;
                         /* re-dispatch in state 8 from the rewound source position */
    1118                 continue;
    1119             }
    1120             break;
    1121         case 8:
    1122             /* call UTF-32BE */
    1123             pArgs->source=source;
    1124             if(offsets==NULL) {
    1125                 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
    1126             } else {
    1127                 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode);
    1128             }
    1129             source=pArgs->source;
    1130             break;
    1131         case 9:
    1132             /* call UTF-32LE */
    1133             pArgs->source=source;
    1134             if(offsets==NULL) {
    1135                 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
    1136             } else {
    1137                 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode);
    1138             }
    1139             source=pArgs->source;
    1140             break;
    1141         default:
    1142             break; /* does not occur */
    1143         }
    1144     }
    1145 
    1146     /* add BOM size to offsets - see comment at offsetDelta declaration */
    1147     if(offsets!=NULL && offsetDelta!=0) {
                 /* pArgs->offsets now points just past the last offset written during this call */
    1148         int32_t *offsetsLimit=pArgs->offsets;
    1149         while(offsets<offsetsLimit) {
    1150             *offsets++ += offsetDelta;
    1151         }
    1152     }
    1153 
    1154     pArgs->source=source;
    1155 
    1156     if(source==sourceLimit && pArgs->flush) {
    1157         /* handle truncated input */
    1158         switch(state) {
    1159         case 0:
    1160             break; /* no input at all, nothing to do */
    1161         case 8:
    1162             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
    1163             break;
    1164         case 9:
    1165             T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode);
    1166             break;
    1167         default:
    1168             /* handle 0<state<8: call UTF-32BE with too-short input */
    1169             pArgs->source=utf32BOM+(state&4); /* select the correct BOM */
    1170             pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */
    1171 
    1172             /* no offsets: not enough for output */
    1173             T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode);
                     /* put the caller's real pointers back after the replay */
    1174             pArgs->source=source;
    1175             pArgs->sourceLimit=sourceLimit;
    1176             state=8;
    1177             break;
    1178         }
    1179     }
    1180 
    1181     cnv->mode=state;
    1182 }
   1183 
    1184 static UChar32
    1185 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs,
    1186                    UErrorCode *pErrorCode) {
             /*
              * getNextUChar for the BOM-detecting UTF-32 converter. Once the byte order
              * is known (mode 8 = UTF-32BE, mode 9 = UTF-32LE) the matching fixed-order
              * function is called. In every other mode it returns
              * UCNV_GET_NEXT_UCHAR_USE_TO_U (presumably telling the caller to go through
              * the toUnicode path, which performs the BOM detection - confirm at the
              * call site).
              */
    1187     switch(pArgs->converter->mode) {
    1188     case 8:
    1189         return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode);
    1190     case 9:
    1191         return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode);
    1192     default:
    1193         return UCNV_GET_NEXT_UCHAR_USE_TO_U;
    1194     }
    1195 }
   1196 
    1197 static const UConverterImpl _UTF32Impl = {
             /*
              * Function table for the BOM-detecting UTF-32 converter.
              * NOTE(review): slot meanings follow the field order of UConverterImpl,
              * which is declared outside this file - confirm against its definition.
              */
    1198     UCNV_UTF32,
    1199 
    1200     NULL,
    1201     NULL,
    1202 
    1203     _UTF32Open,
    1204     NULL,
    1205     _UTF32Reset,
    1206 
             /* the same function fills both toUnicode slots; it checks pArgs->offsets itself */
    1207     _UTF32ToUnicodeWithOffsets,
    1208     _UTF32ToUnicodeWithOffsets,
             /* fromUnicode output uses the byte order of the platform the library is built for */
    1209 #if U_IS_BIG_ENDIAN
    1210     T_UConverter_fromUnicode_UTF32_BE,
    1211     T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC,
    1212 #else
    1213     T_UConverter_fromUnicode_UTF32_LE,
    1214     T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC,
    1215 #endif
    1216     _UTF32GetNextUChar,
    1217 
    1218     NULL, /* ### TODO implement getStarters for all Unicode encodings?! */
    1219     NULL,
    1220     NULL,
    1221     NULL,
    1222     ucnv_getNonSurrogateUnicodeSet
    1223 };
   1224 
    1225 /* The 1236 CCSID refers to any version of Unicode with a BOM-sensitive endianness of UTF-32 */
    1226 static const UConverterStaticData _UTF32StaticData = {
    1227     sizeof(UConverterStaticData),
    1228     "UTF-32",
    1229     1236,
             /* presumably platform, converter type, min and max bytes per character - confirm against UConverterStaticData */
    1230     UCNV_IBM, UCNV_UTF32, 4, 4,
             /* U+FFFD as 4 bytes in the platform's byte order (presumably the substitution character) */
    1231 #if U_IS_BIG_ENDIAN
    1232     { 0, 0, 0xff, 0xfd }, 4,
    1233 #else
    1234     { 0xfd, 0xff, 0, 0 }, 4,
    1235 #endif
    1236     FALSE, FALSE,
    1237     0,
    1238     0,
    1239     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
    1240 };
   1241 
    1242 const UConverterSharedData _UTF32Data = {
             /*
              * Externally visible shared-data record for the BOM-detecting UTF-32
              * converter; it ties together _UTF32StaticData and the _UTF32Impl table.
              * NOTE(review): the remaining fields follow UConverterSharedData's
              * declaration order, which is not visible in this file.
              */
    1243     sizeof(UConverterSharedData), ~((uint32_t) 0),
    1244     NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl,
    1245     0
    1246 };
   1247 
   1248 #endif
   1249