Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2005, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvbocu.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002mar27
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is an implementation of the Binary Ordered Compression for Unicode,
     17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_CONVERSION
     23 
     24 #include "unicode/ucnv.h"
     25 #include "unicode/ucnv_cb.h"
     26 #include "ucnv_bld.h"
     27 #include "ucnv_cnv.h"
     28 
     29 /* BOCU-1 constants and macros ---------------------------------------------- */
     30 
     31 /*
     32  * BOCU-1 encodes the code points of a Unicode string as
     33  * a sequence of byte-encoded differences (slope detection),
     34  * preserving lexical order.
     35  *
     36  * Optimize the difference-taking for runs of Unicode text within
     37  * small scripts:
     38  *
     39  * Most small scripts are allocated within aligned 128-blocks of Unicode
     40  * code points. Lexical order is preserved if the "previous code point" state
     41  * is always moved into the middle of such a block.
     42  *
     43  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     44  * areas into the middle of those areas.
     45  *
     46  * C0 control codes and space are encoded with their US-ASCII bytes.
     47  * "prev" is reset for C0 controls but not for space.
     48  */
     49 
     50 /* initial value for "prev": middle of the ASCII range */
     51 #define BOCU1_ASCII_PREV        0x40
     52 
     53 /* bounding byte values for differences */
     54 #define BOCU1_MIN               0x21
     55 #define BOCU1_MIDDLE            0x90
     56 #define BOCU1_MAX_LEAD          0xfe
     57 #define BOCU1_MAX_TRAIL         0xff
     58 #define BOCU1_RESET             0xff
     59 
     60 /* number of lead bytes */
     61 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
     62 
     63 /* adjust trail byte counts for the use of some C0 control byte values */
     64 #define BOCU1_TRAIL_CONTROLS_COUNT  20
     65 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
     66 
     67 /* number of trail bytes */
     68 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
     69 
     70 /*
     71  * number of positive and negative single-byte codes
     72  * (counting 0==BOCU1_MIDDLE among the positive ones)
     73  */
     74 #define BOCU1_SINGLE            64
     75 
     76 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
     77 #define BOCU1_LEAD_2            43
     78 #define BOCU1_LEAD_3            3
     79 #define BOCU1_LEAD_4            1
     80 
     81 /* The difference value range for single-byters. */
     82 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
     83 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
     84 
     85 /* The difference value range for double-byters. */
     86 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     87 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     88 
     89 /* The difference value range for 3-byters. */
     90 #define BOCU1_REACH_POS_3   \
     91     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     92 
     93 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     94 
     95 /* The lead byte start values. */
     96 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
     97 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
     98 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     99      /* ==BOCU1_MAX_LEAD */
    100 
    101 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
    102 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
    103 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    104      /* ==BOCU1_MIN+1 */
    105 
    106 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
    107 #define BOCU1_LENGTH_FROM_LEAD(lead) \
    108     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
    109      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
    110      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
    111 
    112 /* The length of a byte sequence, according to its packed form. */
    113 #define BOCU1_LENGTH_FROM_PACKED(packed) \
    114     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    115 
    116 /*
    117  * 12 commonly used C0 control codes (and space) are only used to encode
    118  * themselves directly,
    119  * which makes BOCU-1 MIME-usable and reasonably safe for
    120  * ASCII-oriented software.
    121  *
    122  * These controls are
    123  *  0   NUL
    124  *
    125  *  7   BEL
    126  *  8   BS
    127  *
    128  *  9   TAB
    129  *  a   LF
    130  *  b   VT
    131  *  c   FF
    132  *  d   CR
    133  *
    134  *  e   SO
    135  *  f   SI
    136  *
    137  * 1a   SUB
    138  * 1b   ESC
    139  *
    140  * The other 20 C0 controls are also encoded directly (to preserve order)
    141  * but are also used as trail bytes in difference encoding
    142  * (for better compression).
    143  */
    144 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    145 
    146 /*
    147  * Byte value map for control codes,
    148  * from external byte values 0x00..0x20
    149  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    150  * External byte values that are illegal as trail bytes are mapped to -1.
    151  */
    152 static const int8_t
    153 bocu1ByteToTrail[BOCU1_MIN]={
    154 /*  0     1     2     3     4     5     6     7    */
    155     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    156 
    157 /*  8     9     a     b     c     d     e     f    */
    158     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    159 
    160 /*  10    11    12    13    14    15    16    17   */
    161     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    162 
    163 /*  18    19    1a    1b    1c    1d    1e    1f   */
    164     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    165 
    166 /*  20   */
    167     -1
    168 };
    169 
    170 /*
    171  * Byte value map for control codes,
    172  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    173  * to external byte values 0x00..0x20.
    174  */
    175 static const int8_t
    176 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    177 /*  0     1     2     3     4     5     6     7    */
    178     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    179 
    180 /*  8     9     a     b     c     d     e     f    */
    181     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    182 
    183 /*  10    11    12    13   */
    184     0x1c, 0x1d, 0x1e, 0x1f
    185 };
    186 
    187 /**
    188  * Integer division and modulo with negative numerators
    189  * yields negative modulo results and quotients that are one more than
    190  * what we need here.
    191  * This macro adjust the results so that the modulo-value m is always >=0.
    192  *
    193  * For positive n, the if() condition is always FALSE.
    194  *
    195  * @param n Number to be split into quotient and rest.
    196  *          Will be modified to contain the quotient.
    197  * @param d Divisor.
    198  * @param m Output variable for the rest (modulo result).
    199  */
    200 #define NEGDIVMOD(n, d, m) { \
    201     (m)=(n)%(d); \
    202     (n)/=(d); \
    203     if((m)<0) { \
    204         --(n); \
    205         (m)+=(d); \
    206     } \
    207 }
    208 
    209 /* BOCU-1 implementation functions ------------------------------------------ */
    210 
    211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
    212 
    213 /**
    214  * Compute the next "previous" value for differencing
    215  * from the current code point.
    216  *
    217  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
    218  * @return "previous code point" state value
    219  */
    220 static U_INLINE int32_t
    221 bocu1Prev(int32_t c) {
    222     /* compute new prev */
    223     if(/* 0x3040<=c && */ c<=0x309f) {
    224         /* Hiragana is not 128-aligned */
    225         return 0x3070;
    226     } else if(0x4e00<=c && c<=0x9fa5) {
    227         /* CJK Unihan */
    228         return 0x4e00-BOCU1_REACH_NEG_2;
    229     } else if(0xac00<=c /* && c<=0xd7a3 */) {
    230         /* Korean Hangul */
    231         return (0xd7a3+0xac00)/2;
    232     } else {
    233         /* mostly small scripts */
    234         return BOCU1_SIMPLE_PREV(c);
    235     }
    236 }
    237 
    238 /** Fast version of bocu1Prev() for most scripts. */
    239 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
    240 
    241 /*
    242  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
    243  * The UConverter fields are used as follows:
    244  *
    245  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    246  *
    247  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    248  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
    249  */
    250 
    251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
    252 
    253 /**
    254  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    255  * and return a packed integer with them.
    256  *
    257  * The encoding favors small absolut differences with short encodings
    258  * to compress runs of same-script characters.
    259  *
    260  * Optimized version with unrolled loops and fewer floating-point operations
    261  * than the standard packDiff().
    262  *
    263  * @param diff difference value -0x10ffff..0x10ffff
    264  * @return
    265  *      0x010000zz for 1-byte sequence zz
    266  *      0x0200yyzz for 2-byte sequence yy zz
    267  *      0x03xxyyzz for 3-byte sequence xx yy zz
    268  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    269  */
    270 static int32_t
    271 packDiff(int32_t diff) {
    272     int32_t result, m;
    273 
    274     if(diff>=BOCU1_REACH_NEG_1) {
    275         /* mostly positive differences, and single-byte negative ones */
    276 #if 0   /* single-byte case handled in macros, see below */
    277         if(diff<=BOCU1_REACH_POS_1) {
    278             /* single byte */
    279             return 0x01000000|(BOCU1_MIDDLE+diff);
    280         } else
    281 #endif
    282         if(diff<=BOCU1_REACH_POS_2) {
    283             /* two bytes */
    284             diff-=BOCU1_REACH_POS_1+1;
    285             result=0x02000000;
    286 
    287             m=diff%BOCU1_TRAIL_COUNT;
    288             diff/=BOCU1_TRAIL_COUNT;
    289             result|=BOCU1_TRAIL_TO_BYTE(m);
    290 
    291             result|=(BOCU1_START_POS_2+diff)<<8;
    292         } else if(diff<=BOCU1_REACH_POS_3) {
    293             /* three bytes */
    294             diff-=BOCU1_REACH_POS_2+1;
    295             result=0x03000000;
    296 
    297             m=diff%BOCU1_TRAIL_COUNT;
    298             diff/=BOCU1_TRAIL_COUNT;
    299             result|=BOCU1_TRAIL_TO_BYTE(m);
    300 
    301             m=diff%BOCU1_TRAIL_COUNT;
    302             diff/=BOCU1_TRAIL_COUNT;
    303             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    304 
    305             result|=(BOCU1_START_POS_3+diff)<<16;
    306         } else {
    307             /* four bytes */
    308             diff-=BOCU1_REACH_POS_3+1;
    309 
    310             m=diff%BOCU1_TRAIL_COUNT;
    311             diff/=BOCU1_TRAIL_COUNT;
    312             result=BOCU1_TRAIL_TO_BYTE(m);
    313 
    314             m=diff%BOCU1_TRAIL_COUNT;
    315             diff/=BOCU1_TRAIL_COUNT;
    316             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    317 
    318             /*
    319              * We know that / and % would deliver quotient 0 and rest=diff.
    320              * Avoid division and modulo for performance.
    321              */
    322             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
    323 
    324             result|=((uint32_t)BOCU1_START_POS_4)<<24;
    325         }
    326     } else {
    327         /* two- to four-byte negative differences */
    328         if(diff>=BOCU1_REACH_NEG_2) {
    329             /* two bytes */
    330             diff-=BOCU1_REACH_NEG_1;
    331             result=0x02000000;
    332 
    333             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    334             result|=BOCU1_TRAIL_TO_BYTE(m);
    335 
    336             result|=(BOCU1_START_NEG_2+diff)<<8;
    337         } else if(diff>=BOCU1_REACH_NEG_3) {
    338             /* three bytes */
    339             diff-=BOCU1_REACH_NEG_2;
    340             result=0x03000000;
    341 
    342             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    343             result|=BOCU1_TRAIL_TO_BYTE(m);
    344 
    345             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    346             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    347 
    348             result|=(BOCU1_START_NEG_3+diff)<<16;
    349         } else {
    350             /* four bytes */
    351             diff-=BOCU1_REACH_NEG_3;
    352 
    353             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    354             result=BOCU1_TRAIL_TO_BYTE(m);
    355 
    356             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    357             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    358 
    359             /*
    360              * We know that NEGDIVMOD would deliver
    361              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
    362              * Avoid division and modulo for performance.
    363              */
    364             m=diff+BOCU1_TRAIL_COUNT;
    365             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
    366 
    367             result|=BOCU1_MIN<<24;
    368         }
    369     }
    370     return result;
    371 }
    372 
    373 /* Faster versions of packDiff() for single-byte-encoded diff values. */
    374 
    375 /** Is a diff value encodable in a single byte? */
    376 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
    377 
    378 /** Encode a diff value in a single byte. */
    379 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
    380 
    381 /** Is a diff value encodable in two bytes? */
    382 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
    383 
    384 static void
    385 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    386                              UErrorCode *pErrorCode) {
    387     UConverter *cnv;
    388     const UChar *source, *sourceLimit;
    389     uint8_t *target;
    390     int32_t targetCapacity;
    391     int32_t *offsets;
    392 
    393     int32_t prev, c, diff;
    394 
    395     int32_t sourceIndex, nextSourceIndex;
    396 
    397 U_ALIGN_CODE(16)
    398 
    399     /* set up the local pointers */
    400     cnv=pArgs->converter;
    401     source=pArgs->source;
    402     sourceLimit=pArgs->sourceLimit;
    403     target=(uint8_t *)pArgs->target;
    404     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    405     offsets=pArgs->offsets;
    406 
    407     /* get the converter state from UConverter */
    408     c=cnv->fromUChar32;
    409     prev=(int32_t)cnv->fromUnicodeStatus;
    410     if(prev==0) {
    411         prev=BOCU1_ASCII_PREV;
    412     }
    413 
    414     /* sourceIndex=-1 if the current character began in the previous buffer */
    415     sourceIndex= c==0 ? 0 : -1;
    416     nextSourceIndex=0;
    417 
    418     /* conversion loop */
    419     if(c!=0 && targetCapacity>0) {
    420         goto getTrail;
    421     }
    422 
    423 fastSingle:
    424     /* fast loop for single-byte differences */
    425     /* use only one loop counter variable, targetCapacity, not also source */
    426     diff=(int32_t)(sourceLimit-source);
    427     if(targetCapacity>diff) {
    428         targetCapacity=diff;
    429     }
    430     while(targetCapacity>0 && (c=*source)<0x3000) {
    431         if(c<=0x20) {
    432             if(c!=0x20) {
    433                 prev=BOCU1_ASCII_PREV;
    434             }
    435             *target++=(uint8_t)c;
    436             *offsets++=nextSourceIndex++;
    437             ++source;
    438             --targetCapacity;
    439         } else {
    440             diff=c-prev;
    441             if(DIFF_IS_SINGLE(diff)) {
    442                 prev=BOCU1_SIMPLE_PREV(c);
    443                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    444                 *offsets++=nextSourceIndex++;
    445                 ++source;
    446                 --targetCapacity;
    447             } else {
    448                 break;
    449             }
    450         }
    451     }
    452     /* restore real values */
    453     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    454     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
    455 
    456     /* regular loop for all cases */
    457     while(source<sourceLimit) {
    458         if(targetCapacity>0) {
    459             c=*source++;
    460             ++nextSourceIndex;
    461 
    462             if(c<=0x20) {
    463                 /*
    464                  * ISO C0 control & space:
    465                  * Encode directly for MIME compatibility,
    466                  * and reset state except for space, to not disrupt compression.
    467                  */
    468                 if(c!=0x20) {
    469                     prev=BOCU1_ASCII_PREV;
    470                 }
    471                 *target++=(uint8_t)c;
    472                 *offsets++=sourceIndex;
    473                 --targetCapacity;
    474 
    475                 sourceIndex=nextSourceIndex;
    476                 continue;
    477             }
    478 
    479             if(UTF_IS_LEAD(c)) {
    480 getTrail:
    481                 if(source<sourceLimit) {
    482                     /* test the following code unit */
    483                     UChar trail=*source;
    484                     if(UTF_IS_SECOND_SURROGATE(trail)) {
    485                         ++source;
    486                         ++nextSourceIndex;
    487                         c=UTF16_GET_PAIR_VALUE(c, trail);
    488                     }
    489                 } else {
    490                     /* no more input */
    491                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    492                     break;
    493                 }
    494             }
    495 
    496             /*
    497              * all other Unicode code points c==U+0021..U+10ffff
    498              * are encoded with the difference c-prev
    499              *
    500              * a new prev is computed from c,
    501              * placed in the middle of a 0x80-block (for most small scripts) or
    502              * in the middle of the Unihan and Hangul blocks
    503              * to statistically minimize the following difference
    504              */
    505             diff=c-prev;
    506             prev=BOCU1_PREV(c);
    507             if(DIFF_IS_SINGLE(diff)) {
    508                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    509                 *offsets++=sourceIndex;
    510                 --targetCapacity;
    511                 sourceIndex=nextSourceIndex;
    512                 if(c<0x3000) {
    513                     goto fastSingle;
    514                 }
    515             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    516                 /* optimize 2-byte case */
    517                 int32_t m;
    518 
    519                 if(diff>=0) {
    520                     diff-=BOCU1_REACH_POS_1+1;
    521                     m=diff%BOCU1_TRAIL_COUNT;
    522                     diff/=BOCU1_TRAIL_COUNT;
    523                     diff+=BOCU1_START_POS_2;
    524                 } else {
    525                     diff-=BOCU1_REACH_NEG_1;
    526                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    527                     diff+=BOCU1_START_NEG_2;
    528                 }
    529                 *target++=(uint8_t)diff;
    530                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    531                 *offsets++=sourceIndex;
    532                 *offsets++=sourceIndex;
    533                 targetCapacity-=2;
    534                 sourceIndex=nextSourceIndex;
    535             } else {
    536                 int32_t length; /* will be 2..4 */
    537 
    538                 diff=packDiff(diff);
    539                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    540 
    541                 /* write the output character bytes from diff and length */
    542                 /* from the first if in the loop we know that targetCapacity>0 */
    543                 if(length<=targetCapacity) {
    544                     switch(length) {
    545                         /* each branch falls through to the next one */
    546                     case 4:
    547                         *target++=(uint8_t)(diff>>24);
    548                         *offsets++=sourceIndex;
    549                     case 3:
    550                         *target++=(uint8_t)(diff>>16);
    551                         *offsets++=sourceIndex;
    552                     case 2:
    553                         *target++=(uint8_t)(diff>>8);
    554                         *offsets++=sourceIndex;
    555                     /* case 1: handled above */
    556                         *target++=(uint8_t)diff;
    557                         *offsets++=sourceIndex;
    558                     default:
    559                         /* will never occur */
    560                         break;
    561                     }
    562                     targetCapacity-=length;
    563                     sourceIndex=nextSourceIndex;
    564                 } else {
    565                     uint8_t *charErrorBuffer;
    566 
    567                     /*
    568                      * We actually do this backwards here:
    569                      * In order to save an intermediate variable, we output
    570                      * first to the overflow buffer what does not fit into the
    571                      * regular target.
    572                      */
    573                     /* we know that 1<=targetCapacity<length<=4 */
    574                     length-=targetCapacity;
    575                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    576                     switch(length) {
    577                         /* each branch falls through to the next one */
    578                     case 3:
    579                         *charErrorBuffer++=(uint8_t)(diff>>16);
    580                     case 2:
    581                         *charErrorBuffer++=(uint8_t)(diff>>8);
    582                     case 1:
    583                         *charErrorBuffer=(uint8_t)diff;
    584                     default:
    585                         /* will never occur */
    586                         break;
    587                     }
    588                     cnv->charErrorBufferLength=(int8_t)length;
    589 
    590                     /* now output what fits into the regular target */
    591                     diff>>=8*length; /* length was reduced by targetCapacity */
    592                     switch(targetCapacity) {
    593                         /* each branch falls through to the next one */
    594                     case 3:
    595                         *target++=(uint8_t)(diff>>16);
    596                         *offsets++=sourceIndex;
    597                     case 2:
    598                         *target++=(uint8_t)(diff>>8);
    599                         *offsets++=sourceIndex;
    600                     case 1:
    601                         *target++=(uint8_t)diff;
    602                         *offsets++=sourceIndex;
    603                     default:
    604                         /* will never occur */
    605                         break;
    606                     }
    607 
    608                     /* target overflow */
    609                     targetCapacity=0;
    610                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    611                     break;
    612                 }
    613             }
    614         } else {
    615             /* target is full */
    616             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    617             break;
    618         }
    619     }
    620 
    621     /* set the converter state back into UConverter */
    622     cnv->fromUChar32= c<0 ? -c : 0;
    623     cnv->fromUnicodeStatus=(uint32_t)prev;
    624 
    625     /* write back the updated pointers */
    626     pArgs->source=source;
    627     pArgs->target=(char *)target;
    628     pArgs->offsets=offsets;
    629 }
    630 
    631 /*
    632  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
    633  * If a change is made in the original function, then either
    634  * change this function the same way or
    635  * re-copy the original function and remove the variables
    636  * offsets, sourceIndex, and nextSourceIndex.
    637  */
    638 static void
    639 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
    640                   UErrorCode *pErrorCode) {
    641     UConverter *cnv;
    642     const UChar *source, *sourceLimit;
    643     uint8_t *target;
    644     int32_t targetCapacity;
    645 
    646     int32_t prev, c, diff;
    647 
    648     /* set up the local pointers */
    649     cnv=pArgs->converter;
    650     source=pArgs->source;
    651     sourceLimit=pArgs->sourceLimit;
    652     target=(uint8_t *)pArgs->target;
    653     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    654 
    655     /* get the converter state from UConverter */
    656     c=cnv->fromUChar32;
    657     prev=(int32_t)cnv->fromUnicodeStatus;
    658     if(prev==0) {
    659         prev=BOCU1_ASCII_PREV;
    660     }
    661 
    662     /* conversion loop */
    663     if(c!=0 && targetCapacity>0) {
    664         goto getTrail;
    665     }
    666 
    667 fastSingle:
    668     /* fast loop for single-byte differences */
    669     /* use only one loop counter variable, targetCapacity, not also source */
    670     diff=(int32_t)(sourceLimit-source);
    671     if(targetCapacity>diff) {
    672         targetCapacity=diff;
    673     }
    674     while(targetCapacity>0 && (c=*source)<0x3000) {
    675         if(c<=0x20) {
    676             if(c!=0x20) {
    677                 prev=BOCU1_ASCII_PREV;
    678             }
    679             *target++=(uint8_t)c;
    680         } else {
    681             diff=c-prev;
    682             if(DIFF_IS_SINGLE(diff)) {
    683                 prev=BOCU1_SIMPLE_PREV(c);
    684                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    685             } else {
    686                 break;
    687             }
    688         }
    689         ++source;
    690         --targetCapacity;
    691     }
    692     /* restore real values */
    693     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    694 
    695     /* regular loop for all cases */
    696     while(source<sourceLimit) {
    697         if(targetCapacity>0) {
    698             c=*source++;
    699 
    700             if(c<=0x20) {
    701                 /*
    702                  * ISO C0 control & space:
    703                  * Encode directly for MIME compatibility,
    704                  * and reset state except for space, to not disrupt compression.
    705                  */
    706                 if(c!=0x20) {
    707                     prev=BOCU1_ASCII_PREV;
    708                 }
    709                 *target++=(uint8_t)c;
    710                 --targetCapacity;
    711                 continue;
    712             }
    713 
    714             if(UTF_IS_LEAD(c)) {
    715 getTrail:
    716                 if(source<sourceLimit) {
    717                     /* test the following code unit */
    718                     UChar trail=*source;
    719                     if(UTF_IS_SECOND_SURROGATE(trail)) {
    720                         ++source;
    721                         c=UTF16_GET_PAIR_VALUE(c, trail);
    722                     }
    723                 } else {
    724                     /* no more input */
    725                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    726                     break;
    727                 }
    728             }
    729 
    730             /*
    731              * all other Unicode code points c==U+0021..U+10ffff
    732              * are encoded with the difference c-prev
    733              *
    734              * a new prev is computed from c,
    735              * placed in the middle of a 0x80-block (for most small scripts) or
    736              * in the middle of the Unihan and Hangul blocks
    737              * to statistically minimize the following difference
    738              */
    739             diff=c-prev;
    740             prev=BOCU1_PREV(c);
    741             if(DIFF_IS_SINGLE(diff)) {
    742                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    743                 --targetCapacity;
    744                 if(c<0x3000) {
    745                     goto fastSingle;
    746                 }
    747             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    748                 /* optimize 2-byte case */
    749                 int32_t m;
    750 
    751                 if(diff>=0) {
    752                     diff-=BOCU1_REACH_POS_1+1;
    753                     m=diff%BOCU1_TRAIL_COUNT;
    754                     diff/=BOCU1_TRAIL_COUNT;
    755                     diff+=BOCU1_START_POS_2;
    756                 } else {
    757                     diff-=BOCU1_REACH_NEG_1;
    758                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    759                     diff+=BOCU1_START_NEG_2;
    760                 }
    761                 *target++=(uint8_t)diff;
    762                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    763                 targetCapacity-=2;
    764             } else {
    765                 int32_t length; /* will be 2..4 */
    766 
    767                 diff=packDiff(diff);
    768                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    769 
    770                 /* write the output character bytes from diff and length */
    771                 /* from the first if in the loop we know that targetCapacity>0 */
    772                 if(length<=targetCapacity) {
    773                     switch(length) {
    774                         /* each branch falls through to the next one */
    775                     case 4:
    776                         *target++=(uint8_t)(diff>>24);
    777                     case 3:
    778                         *target++=(uint8_t)(diff>>16);
    779                     /* case 2: handled above */
    780                         *target++=(uint8_t)(diff>>8);
    781                     /* case 1: handled above */
    782                         *target++=(uint8_t)diff;
    783                     default:
    784                         /* will never occur */
    785                         break;
    786                     }
    787                     targetCapacity-=length;
    788                 } else {
    789                     uint8_t *charErrorBuffer;
    790 
    791                     /*
    792                      * We actually do this backwards here:
    793                      * In order to save an intermediate variable, we output
    794                      * first to the overflow buffer what does not fit into the
    795                      * regular target.
    796                      */
    797                     /* we know that 1<=targetCapacity<length<=4 */
    798                     length-=targetCapacity;
    799                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    800                     switch(length) {
    801                         /* each branch falls through to the next one */
    802                     case 3:
    803                         *charErrorBuffer++=(uint8_t)(diff>>16);
    804                     case 2:
    805                         *charErrorBuffer++=(uint8_t)(diff>>8);
    806                     case 1:
    807                         *charErrorBuffer=(uint8_t)diff;
    808                     default:
    809                         /* will never occur */
    810                         break;
    811                     }
    812                     cnv->charErrorBufferLength=(int8_t)length;
    813 
    814                     /* now output what fits into the regular target */
    815                     diff>>=8*length; /* length was reduced by targetCapacity */
    816                     switch(targetCapacity) {
    817                         /* each branch falls through to the next one */
    818                     case 3:
    819                         *target++=(uint8_t)(diff>>16);
    820                     case 2:
    821                         *target++=(uint8_t)(diff>>8);
    822                     case 1:
    823                         *target++=(uint8_t)diff;
    824                     default:
    825                         /* will never occur */
    826                         break;
    827                     }
    828 
    829                     /* target overflow */
    830                     targetCapacity=0;
    831                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    832                     break;
    833                 }
    834             }
    835         } else {
    836             /* target is full */
    837             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    838             break;
    839         }
    840     }
    841 
    842     /* set the converter state back into UConverter */
    843     cnv->fromUChar32= c<0 ? -c : 0;
    844     cnv->fromUnicodeStatus=(uint32_t)prev;
    845 
    846     /* write back the updated pointers */
    847     pArgs->source=source;
    848     pArgs->target=(char *)target;
    849 }
    850 
    851 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
    852 
    853 /**
    854  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    855  *
    856  * @param b lead byte;
    857  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
    858  * @return (diff<<2)|count
    859  */
    860 static U_INLINE int32_t
    861 decodeBocu1LeadByte(int32_t b) {
    862     int32_t diff, count;
    863 
    864     if(b>=BOCU1_START_NEG_2) {
    865         /* positive difference */
    866         if(b<BOCU1_START_POS_3) {
    867             /* two bytes */
    868             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    869             count=1;
    870         } else if(b<BOCU1_START_POS_4) {
    871             /* three bytes */
    872             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    873             count=2;
    874         } else {
    875             /* four bytes */
    876             diff=BOCU1_REACH_POS_3+1;
    877             count=3;
    878         }
    879     } else {
    880         /* negative difference */
    881         if(b>=BOCU1_START_NEG_3) {
    882             /* two bytes */
    883             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    884             count=1;
    885         } else if(b>BOCU1_MIN) {
    886             /* three bytes */
    887             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    888             count=2;
    889         } else {
    890             /* four bytes */
    891             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    892             count=3;
    893         }
    894     }
    895 
    896     /* return the state for decoding the trail byte(s) */
    897     return (diff<<2)|count;
    898 }
    899 
    900 /**
    901  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    902  *
    903  * @param count number of remaining trail bytes including this one
    904  * @param b trail byte
    905  * @return new delta for diff including b - <0 indicates an error
    906  *
    907  * @see decodeBocu1
    908  */
    909 static U_INLINE int32_t
    910 decodeBocu1TrailByte(int32_t count, int32_t b) {
    911     if(b<=0x20) {
    912         /* skip some C0 controls and make the trail byte range contiguous */
    913         b=bocu1ByteToTrail[b];
    914         /* b<0 for an illegal trail byte value will result in return<0 below */
    915 #if BOCU1_MAX_TRAIL<0xff
    916     } else if(b>BOCU1_MAX_TRAIL) {
    917         return -99;
    918 #endif
    919     } else {
    920         b-=BOCU1_TRAIL_BYTE_OFFSET;
    921     }
    922 
    923     /* add trail byte into difference and decrement count */
    924     if(count==1) {
    925         return b;
    926     } else if(count==2) {
    927         return b*BOCU1_TRAIL_COUNT;
    928     } else /* count==3 */ {
    929         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    930     }
    931 }
    932 
    933 static void
    934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    935                            UErrorCode *pErrorCode) {
    936     UConverter *cnv;
    937     const uint8_t *source, *sourceLimit;
    938     UChar *target;
    939     const UChar *targetLimit;
    940     int32_t *offsets;
    941 
    942     int32_t prev, count, diff, c;
    943 
    944     int8_t byteIndex;
    945     uint8_t *bytes;
    946 
    947     int32_t sourceIndex, nextSourceIndex;
    948 
    949     /* set up the local pointers */
    950     cnv=pArgs->converter;
    951     source=(const uint8_t *)pArgs->source;
    952     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    953     target=pArgs->target;
    954     targetLimit=pArgs->targetLimit;
    955     offsets=pArgs->offsets;
    956 
    957     /* get the converter state from UConverter */
    958     prev=(int32_t)cnv->toUnicodeStatus;
    959     if(prev==0) {
    960         prev=BOCU1_ASCII_PREV;
    961     }
    962     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    963     count=diff&3;
    964     diff>>=2;
    965 
    966     byteIndex=cnv->toULength;
    967     bytes=cnv->toUBytes;
    968 
    969     /* sourceIndex=-1 if the current character began in the previous buffer */
    970     sourceIndex=byteIndex==0 ? 0 : -1;
    971     nextSourceIndex=0;
    972 
    973     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    974     if(count>0 && byteIndex>0 && target<targetLimit) {
    975         goto getTrail;
    976     }
    977 
    978 fastSingle:
    979     /* fast loop for single-byte differences */
    980     /* use count as the only loop counter variable */
    981     diff=(int32_t)(sourceLimit-source);
    982     count=(int32_t)(pArgs->targetLimit-target);
    983     if(count>diff) {
    984         count=diff;
    985     }
    986     while(count>0) {
    987         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
    988             c=prev+(c-BOCU1_MIDDLE);
    989             if(c<0x3000) {
    990                 *target++=(UChar)c;
    991                 *offsets++=nextSourceIndex++;
    992                 prev=BOCU1_SIMPLE_PREV(c);
    993             } else {
    994                 break;
    995             }
    996         } else if(c<=0x20) {
    997             if(c!=0x20) {
    998                 prev=BOCU1_ASCII_PREV;
    999             }
   1000             *target++=(UChar)c;
   1001             *offsets++=nextSourceIndex++;
   1002         } else {
   1003             break;
   1004         }
   1005         ++source;
   1006         --count;
   1007     }
   1008     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   1009 
   1010     /* decode a sequence of single and lead bytes */
   1011     while(source<sourceLimit) {
   1012         if(target>=targetLimit) {
   1013             /* target is full */
   1014             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1015             break;
   1016         }
   1017 
   1018         ++nextSourceIndex;
   1019         c=*source++;
   1020         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1021             /* Write a code point directly from a single-byte difference. */
   1022             c=prev+(c-BOCU1_MIDDLE);
   1023             if(c<0x3000) {
   1024                 *target++=(UChar)c;
   1025                 *offsets++=sourceIndex;
   1026                 prev=BOCU1_SIMPLE_PREV(c);
   1027                 sourceIndex=nextSourceIndex;
   1028                 goto fastSingle;
   1029             }
   1030         } else if(c<=0x20) {
   1031             /*
   1032              * Direct-encoded C0 control code or space.
   1033              * Reset prev for C0 control codes but not for space.
   1034              */
   1035             if(c!=0x20) {
   1036                 prev=BOCU1_ASCII_PREV;
   1037             }
   1038             *target++=(UChar)c;
   1039             *offsets++=sourceIndex;
   1040             sourceIndex=nextSourceIndex;
   1041             continue;
   1042         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1043             /* Optimize two-byte case. */
   1044             if(c>=BOCU1_MIDDLE) {
   1045                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1046             } else {
   1047                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1048             }
   1049 
   1050             /* trail byte */
   1051             ++nextSourceIndex;
   1052             c=decodeBocu1TrailByte(1, *source++);
   1053             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1054                 bytes[0]=source[-2];
   1055                 bytes[1]=source[-1];
   1056                 byteIndex=2;
   1057                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1058                 break;
   1059             }
   1060         } else if(c==BOCU1_RESET) {
   1061             /* only reset the state, no code point */
   1062             prev=BOCU1_ASCII_PREV;
   1063             sourceIndex=nextSourceIndex;
   1064             continue;
   1065         } else {
   1066             /*
   1067              * For multi-byte difference lead bytes, set the decoder state
   1068              * with the partial difference value from the lead byte and
   1069              * with the number of trail bytes.
   1070              */
   1071             bytes[0]=(uint8_t)c;
   1072             byteIndex=1;
   1073 
   1074             diff=decodeBocu1LeadByte(c);
   1075             count=diff&3;
   1076             diff>>=2;
   1077 getTrail:
   1078             for(;;) {
   1079                 if(source>=sourceLimit) {
   1080                     goto endloop;
   1081                 }
   1082                 ++nextSourceIndex;
   1083                 c=bytes[byteIndex++]=*source++;
   1084 
   1085                 /* trail byte in any position */
   1086                 c=decodeBocu1TrailByte(count, c);
   1087                 if(c<0) {
   1088                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1089                     goto endloop;
   1090                 }
   1091 
   1092                 diff+=c;
   1093                 if(--count==0) {
   1094                     /* final trail byte, deliver a code point */
   1095                     byteIndex=0;
   1096                     c=prev+diff;
   1097                     if((uint32_t)c>0x10ffff) {
   1098                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1099                         goto endloop;
   1100                     }
   1101                     break;
   1102                 }
   1103             }
   1104         }
   1105 
   1106         /* calculate the next prev and output c */
   1107         prev=BOCU1_PREV(c);
   1108         if(c<=0xffff) {
   1109             *target++=(UChar)c;
   1110             *offsets++=sourceIndex;
   1111         } else {
   1112             /* output surrogate pair */
   1113             *target++=UTF16_LEAD(c);
   1114             if(target<targetLimit) {
   1115                 *target++=UTF16_TRAIL(c);
   1116                 *offsets++=sourceIndex;
   1117                 *offsets++=sourceIndex;
   1118             } else {
   1119                 /* target overflow */
   1120                 *offsets++=sourceIndex;
   1121                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
   1122                 cnv->UCharErrorBufferLength=1;
   1123                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1124                 break;
   1125             }
   1126         }
   1127         sourceIndex=nextSourceIndex;
   1128     }
   1129 endloop:
   1130 
   1131     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1132         /* set the converter state in UConverter to deal with the next character */
   1133         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1134         cnv->mode=0;
   1135     } else {
   1136         /* set the converter state back into UConverter */
   1137         cnv->toUnicodeStatus=(uint32_t)prev;
   1138         cnv->mode=(diff<<2)|count;
   1139     }
   1140     cnv->toULength=byteIndex;
   1141 
   1142     /* write back the updated pointers */
   1143     pArgs->source=(const char *)source;
   1144     pArgs->target=target;
   1145     pArgs->offsets=offsets;
   1146     return;
   1147 }
   1148 
   1149 /*
   1150  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
   1151  * If a change is made in the original function, then either
   1152  * change this function the same way or
   1153  * re-copy the original function and remove the variables
   1154  * offsets, sourceIndex, and nextSourceIndex.
   1155  */
   1156 static void
   1157 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
   1158                 UErrorCode *pErrorCode) {
   1159     UConverter *cnv;
   1160     const uint8_t *source, *sourceLimit;
   1161     UChar *target;
   1162     const UChar *targetLimit;
   1163 
   1164     int32_t prev, count, diff, c;
   1165 
   1166     int8_t byteIndex;
   1167     uint8_t *bytes;
   1168 
   1169 U_ALIGN_CODE(16)
   1170 
   1171     /* set up the local pointers */
   1172     cnv=pArgs->converter;
   1173     source=(const uint8_t *)pArgs->source;
   1174     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1175     target=pArgs->target;
   1176     targetLimit=pArgs->targetLimit;
   1177 
   1178     /* get the converter state from UConverter */
   1179     prev=(int32_t)cnv->toUnicodeStatus;
   1180     if(prev==0) {
   1181         prev=BOCU1_ASCII_PREV;
   1182     }
   1183     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1184     count=diff&3;
   1185     diff>>=2;
   1186 
   1187     byteIndex=cnv->toULength;
   1188     bytes=cnv->toUBytes;
   1189 
   1190     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
   1191     if(count>0 && byteIndex>0 && target<targetLimit) {
   1192         goto getTrail;
   1193     }
   1194 
   1195 fastSingle:
   1196     /* fast loop for single-byte differences */
   1197     /* use count as the only loop counter variable */
   1198     diff=(int32_t)(sourceLimit-source);
   1199     count=(int32_t)(pArgs->targetLimit-target);
   1200     if(count>diff) {
   1201         count=diff;
   1202     }
   1203     while(count>0) {
   1204         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1205             c=prev+(c-BOCU1_MIDDLE);
   1206             if(c<0x3000) {
   1207                 *target++=(UChar)c;
   1208                 prev=BOCU1_SIMPLE_PREV(c);
   1209             } else {
   1210                 break;
   1211             }
   1212         } else if(c<=0x20) {
   1213             if(c!=0x20) {
   1214                 prev=BOCU1_ASCII_PREV;
   1215             }
   1216             *target++=(UChar)c;
   1217         } else {
   1218             break;
   1219         }
   1220         ++source;
   1221         --count;
   1222     }
   1223 
   1224     /* decode a sequence of single and lead bytes */
   1225     while(source<sourceLimit) {
   1226         if(target>=targetLimit) {
   1227             /* target is full */
   1228             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1229             break;
   1230         }
   1231 
   1232         c=*source++;
   1233         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1234             /* Write a code point directly from a single-byte difference. */
   1235             c=prev+(c-BOCU1_MIDDLE);
   1236             if(c<0x3000) {
   1237                 *target++=(UChar)c;
   1238                 prev=BOCU1_SIMPLE_PREV(c);
   1239                 goto fastSingle;
   1240             }
   1241         } else if(c<=0x20) {
   1242             /*
   1243              * Direct-encoded C0 control code or space.
   1244              * Reset prev for C0 control codes but not for space.
   1245              */
   1246             if(c!=0x20) {
   1247                 prev=BOCU1_ASCII_PREV;
   1248             }
   1249             *target++=(UChar)c;
   1250             continue;
   1251         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1252             /* Optimize two-byte case. */
   1253             if(c>=BOCU1_MIDDLE) {
   1254                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1255             } else {
   1256                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1257             }
   1258 
   1259             /* trail byte */
   1260             c=decodeBocu1TrailByte(1, *source++);
   1261             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1262                 bytes[0]=source[-2];
   1263                 bytes[1]=source[-1];
   1264                 byteIndex=2;
   1265                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1266                 break;
   1267             }
   1268         } else if(c==BOCU1_RESET) {
   1269             /* only reset the state, no code point */
   1270             prev=BOCU1_ASCII_PREV;
   1271             continue;
   1272         } else {
   1273             /*
   1274              * For multi-byte difference lead bytes, set the decoder state
   1275              * with the partial difference value from the lead byte and
   1276              * with the number of trail bytes.
   1277              */
   1278             bytes[0]=(uint8_t)c;
   1279             byteIndex=1;
   1280 
   1281             diff=decodeBocu1LeadByte(c);
   1282             count=diff&3;
   1283             diff>>=2;
   1284 getTrail:
   1285             for(;;) {
   1286                 if(source>=sourceLimit) {
   1287                     goto endloop;
   1288                 }
   1289                 c=bytes[byteIndex++]=*source++;
   1290 
   1291                 /* trail byte in any position */
   1292                 c=decodeBocu1TrailByte(count, c);
   1293                 if(c<0) {
   1294                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1295                     goto endloop;
   1296                 }
   1297 
   1298                 diff+=c;
   1299                 if(--count==0) {
   1300                     /* final trail byte, deliver a code point */
   1301                     byteIndex=0;
   1302                     c=prev+diff;
   1303                     if((uint32_t)c>0x10ffff) {
   1304                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1305                         goto endloop;
   1306                     }
   1307                     break;
   1308                 }
   1309             }
   1310         }
   1311 
   1312         /* calculate the next prev and output c */
   1313         prev=BOCU1_PREV(c);
   1314         if(c<=0xffff) {
   1315             *target++=(UChar)c;
   1316         } else {
   1317             /* output surrogate pair */
   1318             *target++=UTF16_LEAD(c);
   1319             if(target<targetLimit) {
   1320                 *target++=UTF16_TRAIL(c);
   1321             } else {
   1322                 /* target overflow */
   1323                 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c);
   1324                 cnv->UCharErrorBufferLength=1;
   1325                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1326                 break;
   1327             }
   1328         }
   1329     }
   1330 endloop:
   1331 
   1332     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1333         /* set the converter state in UConverter to deal with the next character */
   1334         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1335         cnv->mode=0;
   1336     } else {
   1337         /* set the converter state back into UConverter */
   1338         cnv->toUnicodeStatus=(uint32_t)prev;
   1339         cnv->mode=(diff<<2)|count;
   1340     }
   1341     cnv->toULength=byteIndex;
   1342 
   1343     /* write back the updated pointers */
   1344     pArgs->source=(const char *)source;
   1345     pArgs->target=target;
   1346     return;
   1347 }
   1348 
   1349 /* miscellaneous ------------------------------------------------------------ */
   1350 
/*
 * Implementation table for the BOCU-1 converter.
 * The initializers are positional. This file supplies the converter type,
 * the four conversion functions and ucnv_getCompleteUnicodeSet; every other
 * slot is NULL.
 * NOTE(review): the meaning of each NULL slot is fixed by the declaration of
 * UConverterImpl, which is not visible in this file -- check it there before
 * filling in or reordering any entry.
 */
static const UConverterImpl _Bocu1Impl={
    UCNV_BOCU1,

    NULL,
    NULL,

    NULL,
    NULL,
    NULL,

    /* conversion functions defined in this file */
    _Bocu1ToUnicode,
    _Bocu1ToUnicodeWithOffsets,
    _Bocu1FromUnicode,
    _Bocu1FromUnicodeWithOffsets,
    NULL,

    NULL,
    NULL,
    NULL,
    NULL,
    ucnv_getCompleteUnicodeSet
};
   1373 
/*
 * Static descriptive data for the BOCU-1 converter: structure size,
 * converter name, CCSID, platform and converter type, minimum and maximum
 * bytes per UChar, and the substitution character bytes with their length.
 * NOTE(review): the initializers are positional; the roles of the entries
 * that carry no comment (the two FALSE values, the two 0 values) are defined
 * by the declaration of UConverterStaticData, which is not visible in this
 * file -- confirm there.
 */
static const UConverterStaticData _Bocu1StaticData={
    sizeof(UConverterStaticData),
    "BOCU-1",
    1214, /* CCSID for BOCU-1 */
    UCNV_IBM, UCNV_BOCU1,
    1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
    { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
    FALSE, FALSE,
    0,
    0,
    { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
};
   1386 
/*
 * Shared-data record for the BOCU-1 converter. It ties together the static
 * data (_Bocu1StaticData) and the implementation table (_Bocu1Impl) defined
 * above. Unlike those two objects it is not declared static, so it has
 * external linkage and can be referenced from other source files.
 * NOTE(review): the initializers are positional; the roles of the remaining
 * entries (the all-ones value, the NULLs, FALSE and the trailing 0) are
 * defined by the declaration of UConverterSharedData, which is not visible
 * in this file -- confirm there.
 */
const UConverterSharedData _Bocu1Data={
    sizeof(UConverterSharedData), ~((uint32_t)0),
    NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
    0
};
   1392 
   1393 #endif
   1394