Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvbocu.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002mar27
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is an implementation of the Binary Ordered Compression for Unicode,
     17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_CONVERSION
     23 
     24 #include "unicode/ucnv.h"
     25 #include "unicode/ucnv_cb.h"
     26 #include "unicode/utf16.h"
     27 #include "putilimp.h"
     28 #include "ucnv_bld.h"
     29 #include "ucnv_cnv.h"
     30 #include "uassert.h"
     31 
     32 /* BOCU-1 constants and macros ---------------------------------------------- */
     33 
     34 /*
     35  * BOCU-1 encodes the code points of a Unicode string as
     36  * a sequence of byte-encoded differences (slope detection),
     37  * preserving lexical order.
     38  *
     39  * Optimize the difference-taking for runs of Unicode text within
     40  * small scripts:
     41  *
     42  * Most small scripts are allocated within aligned 128-blocks of Unicode
     43  * code points. Lexical order is preserved if the "previous code point" state
     44  * is always moved into the middle of such a block.
     45  *
     46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     47  * areas into the middle of those areas.
     48  *
     49  * C0 control codes and space are encoded with their US-ASCII bytes.
     50  * "prev" is reset for C0 controls but not for space.
     51  */
     52 
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/*
 * bounding byte values for differences
 *
 * BOCU1_MIDDLE is the byte that encodes a difference of 0.
 * BOCU1_RESET has the same numeric value as BOCU1_MAX_TRAIL (0xff).
 */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe
#define BOCU1_MAX_TRAIL         0xff
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222 == 2*(64+43+3+1)) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values
 *
 * Trail values below BOCU1_TRAIL_CONTROLS_COUNT map to C0 control bytes via a table;
 * trail values at or above it map to bytes by adding BOCU1_TRAIL_BYTE_OFFSET (0x21-20 == 13).
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243) */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/*
 * The lead byte start values.
 * Positive side: first lead byte of each sequence length (0xd0, 0xfb, 0xfe).
 */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/*
 * Negative side: lowest single-byte value (0x50), then the values
 * 0x25 and 0x22 obtained by stepping down by the 2- and 3-byte lead counts.
 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The ranges are tested from the innermost (single bytes around BOCU1_MIDDLE)
 * outwards; whatever is left over is a 4-byte lead.
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length in their top byte;
 * any larger (unsigned) value is treated as a 4-byte sequence.
 * Note: the argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    118 
/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE(t) converts a trail value t into the byte that is written:
 * values >=BOCU1_TRAIL_CONTROLS_COUNT are shifted by BOCU1_TRAIL_BYTE_OFFSET,
 * smaller values are looked up in bocu1TrailToByte[].
 * The argument is evaluated more than once and is used as an array index
 * without a lower-bound check, so it must not be negative.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    148 
    149 /*
    150  * Byte value map for control codes,
    151  * from external byte values 0x00..0x20
    152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    153  * External byte values that are illegal as trail bytes are mapped to -1.
    154  */
    155 static const int8_t
    156 bocu1ByteToTrail[BOCU1_MIN]={
    157 /*  0     1     2     3     4     5     6     7    */
    158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    159 
    160 /*  8     9     a     b     c     d     e     f    */
    161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    162 
    163 /*  10    11    12    13    14    15    16    17   */
    164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    165 
    166 /*  18    19    1a    1b    1c    1d    1e    1f   */
    167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    168 
    169 /*  20   */
    170     -1
    171 };
    172 
    173 /*
    174  * Byte value map for control codes,
    175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    176  * to external byte values 0x00..0x20.
    177  */
    178 static const int8_t
    179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    180 /*  0     1     2     3     4     5     6     7    */
    181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    182 
    183 /*  8     9     a     b     c     d     e     f    */
    184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    185 
    186 /*  10    11    12    13   */
    187     0x1c, 0x1d, 0x1e, 0x1f
    188 };
    189 
    190 /**
    191  * Integer division and modulo with negative numerators
    192  * yields negative modulo results and quotients that are one more than
    193  * what we need here.
    194  * This macro adjust the results so that the modulo-value m is always >=0.
    195  *
    196  * For positive n, the if() condition is always FALSE.
    197  *
    198  * @param n Number to be split into quotient and rest.
    199  *          Will be modified to contain the quotient.
    200  * @param d Divisor.
    201  * @param m Output variable for the rest (modulo result).
    202  */
    203 #define NEGDIVMOD(n, d, m) { \
    204     (m)=(n)%(d); \
    205     (n)/=(d); \
    206     if((m)<0) { \
    207         --(n); \
    208         (m)+=(d); \
    209     } \
    210 }
    211 
/*
 * Faster versions of packDiff() for single-byte-encoded diff values.
 * All of these macros evaluate their argument more than once or rely on
 * plain integer arithmetic; pass side-effect-free expressions.
 */

/** Is a diff value encodable in a single byte? (BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1) */
#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)

/** Encode a diff value in a single byte: offset from BOCU1_MIDDLE. */
#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))

/**
 * Is a diff value encodable in two bytes? (BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2)
 * This range also contains the single-byte range, so test DIFF_IS_SINGLE() first.
 */
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)

/* BOCU-1 implementation functions ------------------------------------------ */

/*
 * Default "prev" for a code point: clear the low 7 bits of c
 * (start of its aligned 128-code-point block) and add BOCU1_ASCII_PREV (0x40),
 * which places the result in the middle of that block.
 */
#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
    226 
    227 /**
    228  * Compute the next "previous" value for differencing
    229  * from the current code point.
    230  *
    231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
    232  * @return "previous code point" state value
    233  */
    234 static inline int32_t
    235 bocu1Prev(int32_t c) {
    236     /* compute new prev */
    237     if(/* 0x3040<=c && */ c<=0x309f) {
    238         /* Hiragana is not 128-aligned */
    239         return 0x3070;
    240     } else if(0x4e00<=c && c<=0x9fa5) {
    241         /* CJK Unihan */
    242         return 0x4e00-BOCU1_REACH_NEG_2;
    243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
    244         /* Korean Hangul */
    245         return (0xd7a3+0xac00)/2;
    246     } else {
    247         /* mostly small scripts */
    248         return BOCU1_SIMPLE_PREV(c);
    249     }
    250 }
    251 
/**
 * Fast version of bocu1Prev() for most scripts.
 * Code points outside 0x3040..0xd7a3 use BOCU1_SIMPLE_PREV() inline;
 * only code points inside that range call bocu1Prev().
 * The argument is evaluated more than once.
 */
#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
    254 
    255 /*
    256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
    257  * The UConverter fields are used as follows:
    258  *
    259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    260  *
    261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
    263  */
    264 
    265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
    266 
    267 /**
    268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    269  * and return a packed integer with them.
    270  *
    271  * The encoding favors small absolute differences with short encodings
    272  * to compress runs of same-script characters.
    273  *
    274  * Optimized version with unrolled loops and fewer floating-point operations
    275  * than the standard packDiff().
    276  *
    277  * @param diff difference value -0x10ffff..0x10ffff
    278  * @return
    279  *      0x010000zz for 1-byte sequence zz
    280  *      0x0200yyzz for 2-byte sequence yy zz
    281  *      0x03xxyyzz for 3-byte sequence xx yy zz
    282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    283  */
    284 static int32_t
    285 packDiff(int32_t diff) {
    286     int32_t result, m;
    287 
    288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    289     if(diff>=BOCU1_REACH_NEG_1) {
    290         /* mostly positive differences, and single-byte negative ones */
    291 #if 0   /* single-byte case handled in macros, see below */
    292         if(diff<=BOCU1_REACH_POS_1) {
    293             /* single byte */
    294             return 0x01000000|(BOCU1_MIDDLE+diff);
    295         } else
    296 #endif
    297         if(diff<=BOCU1_REACH_POS_2) {
    298             /* two bytes */
    299             diff-=BOCU1_REACH_POS_1+1;
    300             result=0x02000000;
    301 
    302             m=diff%BOCU1_TRAIL_COUNT;
    303             diff/=BOCU1_TRAIL_COUNT;
    304             result|=BOCU1_TRAIL_TO_BYTE(m);
    305 
    306             result|=(BOCU1_START_POS_2+diff)<<8;
    307         } else if(diff<=BOCU1_REACH_POS_3) {
    308             /* three bytes */
    309             diff-=BOCU1_REACH_POS_2+1;
    310             result=0x03000000;
    311 
    312             m=diff%BOCU1_TRAIL_COUNT;
    313             diff/=BOCU1_TRAIL_COUNT;
    314             result|=BOCU1_TRAIL_TO_BYTE(m);
    315 
    316             m=diff%BOCU1_TRAIL_COUNT;
    317             diff/=BOCU1_TRAIL_COUNT;
    318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    319 
    320             result|=(BOCU1_START_POS_3+diff)<<16;
    321         } else {
    322             /* four bytes */
    323             diff-=BOCU1_REACH_POS_3+1;
    324 
    325             m=diff%BOCU1_TRAIL_COUNT;
    326             diff/=BOCU1_TRAIL_COUNT;
    327             result=BOCU1_TRAIL_TO_BYTE(m);
    328 
    329             m=diff%BOCU1_TRAIL_COUNT;
    330             diff/=BOCU1_TRAIL_COUNT;
    331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    332 
    333             /*
    334              * We know that / and % would deliver quotient 0 and rest=diff.
    335              * Avoid division and modulo for performance.
    336              */
    337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
    338 
    339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
    340         }
    341     } else {
    342         /* two- to four-byte negative differences */
    343         if(diff>=BOCU1_REACH_NEG_2) {
    344             /* two bytes */
    345             diff-=BOCU1_REACH_NEG_1;
    346             result=0x02000000;
    347 
    348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    349             result|=BOCU1_TRAIL_TO_BYTE(m);
    350 
    351             result|=(BOCU1_START_NEG_2+diff)<<8;
    352         } else if(diff>=BOCU1_REACH_NEG_3) {
    353             /* three bytes */
    354             diff-=BOCU1_REACH_NEG_2;
    355             result=0x03000000;
    356 
    357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    358             result|=BOCU1_TRAIL_TO_BYTE(m);
    359 
    360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    362 
    363             result|=(BOCU1_START_NEG_3+diff)<<16;
    364         } else {
    365             /* four bytes */
    366             diff-=BOCU1_REACH_NEG_3;
    367 
    368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    369             result=BOCU1_TRAIL_TO_BYTE(m);
    370 
    371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    373 
    374             /*
    375              * We know that NEGDIVMOD would deliver
    376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
    377              * Avoid division and modulo for performance.
    378              */
    379             m=diff+BOCU1_TRAIL_COUNT;
    380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
    381 
    382             result|=BOCU1_MIN<<24;
    383         }
    384     }
    385     return result;
    386 }
    387 
    388 
    389 static void
    390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    391                              UErrorCode *pErrorCode) {
    392     UConverter *cnv;
    393     const UChar *source, *sourceLimit;
    394     uint8_t *target;
    395     int32_t targetCapacity;
    396     int32_t *offsets;
    397 
    398     int32_t prev, c, diff;
    399 
    400     int32_t sourceIndex, nextSourceIndex;
    401 
    402 U_ALIGN_CODE(16)
    403 
    404     /* set up the local pointers */
    405     cnv=pArgs->converter;
    406     source=pArgs->source;
    407     sourceLimit=pArgs->sourceLimit;
    408     target=(uint8_t *)pArgs->target;
    409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    410     offsets=pArgs->offsets;
    411 
    412     /* get the converter state from UConverter */
    413     c=cnv->fromUChar32;
    414     prev=(int32_t)cnv->fromUnicodeStatus;
    415     if(prev==0) {
    416         prev=BOCU1_ASCII_PREV;
    417     }
    418 
    419     /* sourceIndex=-1 if the current character began in the previous buffer */
    420     sourceIndex= c==0 ? 0 : -1;
    421     nextSourceIndex=0;
    422 
    423     /* conversion loop */
    424     if(c!=0 && targetCapacity>0) {
    425         goto getTrail;
    426     }
    427 
    428 fastSingle:
    429     /* fast loop for single-byte differences */
    430     /* use only one loop counter variable, targetCapacity, not also source */
    431     diff=(int32_t)(sourceLimit-source);
    432     if(targetCapacity>diff) {
    433         targetCapacity=diff;
    434     }
    435     while(targetCapacity>0 && (c=*source)<0x3000) {
    436         if(c<=0x20) {
    437             if(c!=0x20) {
    438                 prev=BOCU1_ASCII_PREV;
    439             }
    440             *target++=(uint8_t)c;
    441             *offsets++=nextSourceIndex++;
    442             ++source;
    443             --targetCapacity;
    444         } else {
    445             diff=c-prev;
    446             if(DIFF_IS_SINGLE(diff)) {
    447                 prev=BOCU1_SIMPLE_PREV(c);
    448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    449                 *offsets++=nextSourceIndex++;
    450                 ++source;
    451                 --targetCapacity;
    452             } else {
    453                 break;
    454             }
    455         }
    456     }
    457     /* restore real values */
    458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
    460 
    461     /* regular loop for all cases */
    462     while(source<sourceLimit) {
    463         if(targetCapacity>0) {
    464             c=*source++;
    465             ++nextSourceIndex;
    466 
    467             if(c<=0x20) {
    468                 /*
    469                  * ISO C0 control & space:
    470                  * Encode directly for MIME compatibility,
    471                  * and reset state except for space, to not disrupt compression.
    472                  */
    473                 if(c!=0x20) {
    474                     prev=BOCU1_ASCII_PREV;
    475                 }
    476                 *target++=(uint8_t)c;
    477                 *offsets++=sourceIndex;
    478                 --targetCapacity;
    479 
    480                 sourceIndex=nextSourceIndex;
    481                 continue;
    482             }
    483 
    484             if(U16_IS_LEAD(c)) {
    485 getTrail:
    486                 if(source<sourceLimit) {
    487                     /* test the following code unit */
    488                     UChar trail=*source;
    489                     if(U16_IS_TRAIL(trail)) {
    490                         ++source;
    491                         ++nextSourceIndex;
    492                         c=U16_GET_SUPPLEMENTARY(c, trail);
    493                     }
    494                 } else {
    495                     /* no more input */
    496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    497                     break;
    498                 }
    499             }
    500 
    501             /*
    502              * all other Unicode code points c==U+0021..U+10ffff
    503              * are encoded with the difference c-prev
    504              *
    505              * a new prev is computed from c,
    506              * placed in the middle of a 0x80-block (for most small scripts) or
    507              * in the middle of the Unihan and Hangul blocks
    508              * to statistically minimize the following difference
    509              */
    510             diff=c-prev;
    511             prev=BOCU1_PREV(c);
    512             if(DIFF_IS_SINGLE(diff)) {
    513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    514                 *offsets++=sourceIndex;
    515                 --targetCapacity;
    516                 sourceIndex=nextSourceIndex;
    517                 if(c<0x3000) {
    518                     goto fastSingle;
    519                 }
    520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    521                 /* optimize 2-byte case */
    522                 int32_t m;
    523 
    524                 if(diff>=0) {
    525                     diff-=BOCU1_REACH_POS_1+1;
    526                     m=diff%BOCU1_TRAIL_COUNT;
    527                     diff/=BOCU1_TRAIL_COUNT;
    528                     diff+=BOCU1_START_POS_2;
    529                 } else {
    530                     diff-=BOCU1_REACH_NEG_1;
    531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    532                     diff+=BOCU1_START_NEG_2;
    533                 }
    534                 *target++=(uint8_t)diff;
    535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    536                 *offsets++=sourceIndex;
    537                 *offsets++=sourceIndex;
    538                 targetCapacity-=2;
    539                 sourceIndex=nextSourceIndex;
    540             } else {
    541                 int32_t length; /* will be 2..4 */
    542 
    543                 diff=packDiff(diff);
    544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    545 
    546                 /* write the output character bytes from diff and length */
    547                 /* from the first if in the loop we know that targetCapacity>0 */
    548                 if(length<=targetCapacity) {
    549                     switch(length) {
    550                         /* each branch falls through to the next one */
    551                     case 4:
    552                         *target++=(uint8_t)(diff>>24);
    553                         *offsets++=sourceIndex;
    554                     case 3: /*fall through*/
    555                         *target++=(uint8_t)(diff>>16);
    556                         *offsets++=sourceIndex;
    557                     case 2: /*fall through*/
    558                         *target++=(uint8_t)(diff>>8);
    559                         *offsets++=sourceIndex;
    560                     /* case 1: handled above */
    561                         *target++=(uint8_t)diff;
    562                         *offsets++=sourceIndex;
    563                     default:
    564                         /* will never occur */
    565                         break;
    566                     }
    567                     targetCapacity-=length;
    568                     sourceIndex=nextSourceIndex;
    569                 } else {
    570                     uint8_t *charErrorBuffer;
    571 
    572                     /*
    573                      * We actually do this backwards here:
    574                      * In order to save an intermediate variable, we output
    575                      * first to the overflow buffer what does not fit into the
    576                      * regular target.
    577                      */
    578                     /* we know that 1<=targetCapacity<length<=4 */
    579                     length-=targetCapacity;
    580                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    581                     switch(length) {
    582                         /* each branch falls through to the next one */
    583                     case 3:
    584                         *charErrorBuffer++=(uint8_t)(diff>>16);
    585                     case 2: /*fall through*/
    586                         *charErrorBuffer++=(uint8_t)(diff>>8);
    587                     case 1: /*fall through*/
    588                         *charErrorBuffer=(uint8_t)diff;
    589                     default:
    590                         /* will never occur */
    591                         break;
    592                     }
    593                     cnv->charErrorBufferLength=(int8_t)length;
    594 
    595                     /* now output what fits into the regular target */
    596                     diff>>=8*length; /* length was reduced by targetCapacity */
    597                     switch(targetCapacity) {
    598                         /* each branch falls through to the next one */
    599                     case 3:
    600                         *target++=(uint8_t)(diff>>16);
    601                         *offsets++=sourceIndex;
    602                     case 2: /*fall through*/
    603                         *target++=(uint8_t)(diff>>8);
    604                         *offsets++=sourceIndex;
    605                     case 1: /*fall through*/
    606                         *target++=(uint8_t)diff;
    607                         *offsets++=sourceIndex;
    608                     default:
    609                         /* will never occur */
    610                         break;
    611                     }
    612 
    613                     /* target overflow */
    614                     targetCapacity=0;
    615                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    616                     break;
    617                 }
    618             }
    619         } else {
    620             /* target is full */
    621             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    622             break;
    623         }
    624     }
    625 
    626     /* set the converter state back into UConverter */
    627     cnv->fromUChar32= c<0 ? -c : 0;
    628     cnv->fromUnicodeStatus=(uint32_t)prev;
    629 
    630     /* write back the updated pointers */
    631     pArgs->source=source;
    632     pArgs->target=(char *)target;
    633     pArgs->offsets=offsets;
    634 }
    635 
    636 /*
    637  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
    638  * If a change is made in the original function, then either
    639  * change this function the same way or
    640  * re-copy the original function and remove the variables
    641  * offsets, sourceIndex, and nextSourceIndex.
    642  */
    643 static void
    644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
    645                   UErrorCode *pErrorCode) {
    646     UConverter *cnv;
    647     const UChar *source, *sourceLimit;
    648     uint8_t *target;
    649     int32_t targetCapacity;
    650 
    651     int32_t prev, c, diff;
    652 
    653     /* set up the local pointers */
    654     cnv=pArgs->converter;
    655     source=pArgs->source;
    656     sourceLimit=pArgs->sourceLimit;
    657     target=(uint8_t *)pArgs->target;
    658     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    659 
    660     /* get the converter state from UConverter */
    661     c=cnv->fromUChar32;
    662     prev=(int32_t)cnv->fromUnicodeStatus;
    663     if(prev==0) {
    664         prev=BOCU1_ASCII_PREV;
    665     }
    666 
    667     /* conversion loop */
    668     if(c!=0 && targetCapacity>0) {
    669         goto getTrail;
    670     }
    671 
    672 fastSingle:
    673     /* fast loop for single-byte differences */
    674     /* use only one loop counter variable, targetCapacity, not also source */
    675     diff=(int32_t)(sourceLimit-source);
    676     if(targetCapacity>diff) {
    677         targetCapacity=diff;
    678     }
    679     while(targetCapacity>0 && (c=*source)<0x3000) {
    680         if(c<=0x20) {
    681             if(c!=0x20) {
    682                 prev=BOCU1_ASCII_PREV;
    683             }
    684             *target++=(uint8_t)c;
    685         } else {
    686             diff=c-prev;
    687             if(DIFF_IS_SINGLE(diff)) {
    688                 prev=BOCU1_SIMPLE_PREV(c);
    689                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    690             } else {
    691                 break;
    692             }
    693         }
    694         ++source;
    695         --targetCapacity;
    696     }
    697     /* restore real values */
    698     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    699 
    700     /* regular loop for all cases */
    701     while(source<sourceLimit) {
    702         if(targetCapacity>0) {
    703             c=*source++;
    704 
    705             if(c<=0x20) {
    706                 /*
    707                  * ISO C0 control & space:
    708                  * Encode directly for MIME compatibility,
    709                  * and reset state except for space, to not disrupt compression.
    710                  */
    711                 if(c!=0x20) {
    712                     prev=BOCU1_ASCII_PREV;
    713                 }
    714                 *target++=(uint8_t)c;
    715                 --targetCapacity;
    716                 continue;
    717             }
    718 
    719             if(U16_IS_LEAD(c)) {
    720 getTrail:
    721                 if(source<sourceLimit) {
    722                     /* test the following code unit */
    723                     UChar trail=*source;
    724                     if(U16_IS_TRAIL(trail)) {
    725                         ++source;
    726                         c=U16_GET_SUPPLEMENTARY(c, trail);
    727                     }
    728                 } else {
    729                     /* no more input */
    730                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    731                     break;
    732                 }
    733             }
    734 
    735             /*
    736              * all other Unicode code points c==U+0021..U+10ffff
    737              * are encoded with the difference c-prev
    738              *
    739              * a new prev is computed from c,
    740              * placed in the middle of a 0x80-block (for most small scripts) or
    741              * in the middle of the Unihan and Hangul blocks
    742              * to statistically minimize the following difference
    743              */
    744             diff=c-prev;
    745             prev=BOCU1_PREV(c);
    746             if(DIFF_IS_SINGLE(diff)) {
    747                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    748                 --targetCapacity;
    749                 if(c<0x3000) {
    750                     goto fastSingle;
    751                 }
    752             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    753                 /* optimize 2-byte case */
    754                 int32_t m;
    755 
    756                 if(diff>=0) {
    757                     diff-=BOCU1_REACH_POS_1+1;
    758                     m=diff%BOCU1_TRAIL_COUNT;
    759                     diff/=BOCU1_TRAIL_COUNT;
    760                     diff+=BOCU1_START_POS_2;
    761                 } else {
    762                     diff-=BOCU1_REACH_NEG_1;
    763                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    764                     diff+=BOCU1_START_NEG_2;
    765                 }
    766                 *target++=(uint8_t)diff;
    767                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    768                 targetCapacity-=2;
    769             } else {
    770                 int32_t length; /* will be 2..4 */
    771 
    772                 diff=packDiff(diff);
    773                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    774 
    775                 /* write the output character bytes from diff and length */
    776                 /* from the first if in the loop we know that targetCapacity>0 */
    777                 if(length<=targetCapacity) {
    778                     switch(length) {
    779                         /* each branch falls through to the next one */
    780                     case 4:
    781                         *target++=(uint8_t)(diff>>24);
    782                     case 3: /*fall through*/
    783                         *target++=(uint8_t)(diff>>16);
    784                     /* case 2: handled above */
    785                         *target++=(uint8_t)(diff>>8);
    786                     /* case 1: handled above */
    787                         *target++=(uint8_t)diff;
    788                     default:
    789                         /* will never occur */
    790                         break;
    791                     }
    792                     targetCapacity-=length;
    793                 } else {
    794                     uint8_t *charErrorBuffer;
    795 
    796                     /*
    797                      * We actually do this backwards here:
    798                      * In order to save an intermediate variable, we output
    799                      * first to the overflow buffer what does not fit into the
    800                      * regular target.
    801                      */
    802                     /* we know that 1<=targetCapacity<length<=4 */
    803                     length-=targetCapacity;
    804                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    805                     switch(length) {
    806                         /* each branch falls through to the next one */
    807                     case 3:
    808                         *charErrorBuffer++=(uint8_t)(diff>>16);
    809                     case 2: /*fall through*/
    810                         *charErrorBuffer++=(uint8_t)(diff>>8);
    811                     case 1: /*fall through*/
    812                         *charErrorBuffer=(uint8_t)diff;
    813                     default:
    814                         /* will never occur */
    815                         break;
    816                     }
    817                     cnv->charErrorBufferLength=(int8_t)length;
    818 
    819                     /* now output what fits into the regular target */
    820                     diff>>=8*length; /* length was reduced by targetCapacity */
    821                     switch(targetCapacity) {
    822                         /* each branch falls through to the next one */
    823                     case 3:
    824                         *target++=(uint8_t)(diff>>16);
    825                     case 2: /*fall through*/
    826                         *target++=(uint8_t)(diff>>8);
    827                     case 1: /*fall through*/
    828                         *target++=(uint8_t)diff;
    829                     default:
    830                         /* will never occur */
    831                         break;
    832                     }
    833 
    834                     /* target overflow */
    835                     targetCapacity=0;
    836                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    837                     break;
    838                 }
    839             }
    840         } else {
    841             /* target is full */
    842             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    843             break;
    844         }
    845     }
    846 
    847     /* set the converter state back into UConverter */
    848     cnv->fromUChar32= c<0 ? -c : 0;
    849     cnv->fromUnicodeStatus=(uint32_t)prev;
    850 
    851     /* write back the updated pointers */
    852     pArgs->source=source;
    853     pArgs->target=(char *)target;
    854 }
    855 
    856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
    857 
    858 /**
    859  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    860  *
    861  * @param b lead byte;
    862  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
    863  * @return (diff<<2)|count
    864  */
    865 static inline int32_t
    866 decodeBocu1LeadByte(int32_t b) {
    867     int32_t diff, count;
    868 
    869     if(b>=BOCU1_START_NEG_2) {
    870         /* positive difference */
    871         if(b<BOCU1_START_POS_3) {
    872             /* two bytes */
    873             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    874             count=1;
    875         } else if(b<BOCU1_START_POS_4) {
    876             /* three bytes */
    877             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    878             count=2;
    879         } else {
    880             /* four bytes */
    881             diff=BOCU1_REACH_POS_3+1;
    882             count=3;
    883         }
    884     } else {
    885         /* negative difference */
    886         if(b>=BOCU1_START_NEG_3) {
    887             /* two bytes */
    888             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    889             count=1;
    890         } else if(b>BOCU1_MIN) {
    891             /* three bytes */
    892             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    893             count=2;
    894         } else {
    895             /* four bytes */
    896             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    897             count=3;
    898         }
    899     }
    900 
    901     /* return the state for decoding the trail byte(s) */
    902     return (diff<<2)|count;
    903 }
    904 
    905 /**
    906  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    907  *
    908  * @param count number of remaining trail bytes including this one
    909  * @param b trail byte
    910  * @return new delta for diff including b - <0 indicates an error
    911  *
    912  * @see decodeBocu1
    913  */
    914 static inline int32_t
    915 decodeBocu1TrailByte(int32_t count, int32_t b) {
    916     if(b<=0x20) {
    917         /* skip some C0 controls and make the trail byte range contiguous */
    918         b=bocu1ByteToTrail[b];
    919         /* b<0 for an illegal trail byte value will result in return<0 below */
    920 #if BOCU1_MAX_TRAIL<0xff
    921     } else if(b>BOCU1_MAX_TRAIL) {
    922         return -99;
    923 #endif
    924     } else {
    925         b-=BOCU1_TRAIL_BYTE_OFFSET;
    926     }
    927 
    928     /* add trail byte into difference and decrement count */
    929     if(count==1) {
    930         return b;
    931     } else if(count==2) {
    932         return b*BOCU1_TRAIL_COUNT;
    933     } else /* count==3 */ {
    934         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    935     }
    936 }
    937 
    938 static void
    939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    940                            UErrorCode *pErrorCode) {
    941     UConverter *cnv;
    942     const uint8_t *source, *sourceLimit;
    943     UChar *target;
    944     const UChar *targetLimit;
    945     int32_t *offsets;
    946 
    947     int32_t prev, count, diff, c;
    948 
    949     int8_t byteIndex;
    950     uint8_t *bytes;
    951 
    952     int32_t sourceIndex, nextSourceIndex;
    953 
    954     /* set up the local pointers */
    955     cnv=pArgs->converter;
    956     source=(const uint8_t *)pArgs->source;
    957     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    958     target=pArgs->target;
    959     targetLimit=pArgs->targetLimit;
    960     offsets=pArgs->offsets;
    961 
    962     /* get the converter state from UConverter */
    963     prev=(int32_t)cnv->toUnicodeStatus;
    964     if(prev==0) {
    965         prev=BOCU1_ASCII_PREV;
    966     }
    967     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    968     count=diff&3;
    969     diff>>=2;
    970 
    971     byteIndex=cnv->toULength;
    972     bytes=cnv->toUBytes;
    973 
    974     /* sourceIndex=-1 if the current character began in the previous buffer */
    975     sourceIndex=byteIndex==0 ? 0 : -1;
    976     nextSourceIndex=0;
    977 
    978     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    979     if(count>0 && byteIndex>0 && target<targetLimit) {
    980         goto getTrail;
    981     }
    982 
    983 fastSingle:
    984     /* fast loop for single-byte differences */
    985     /* use count as the only loop counter variable */
    986     diff=(int32_t)(sourceLimit-source);
    987     count=(int32_t)(pArgs->targetLimit-target);
    988     if(count>diff) {
    989         count=diff;
    990     }
    991     while(count>0) {
    992         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
    993             c=prev+(c-BOCU1_MIDDLE);
    994             if(c<0x3000) {
    995                 *target++=(UChar)c;
    996                 *offsets++=nextSourceIndex++;
    997                 prev=BOCU1_SIMPLE_PREV(c);
    998             } else {
    999                 break;
   1000             }
   1001         } else if(c<=0x20) {
   1002             if(c!=0x20) {
   1003                 prev=BOCU1_ASCII_PREV;
   1004             }
   1005             *target++=(UChar)c;
   1006             *offsets++=nextSourceIndex++;
   1007         } else {
   1008             break;
   1009         }
   1010         ++source;
   1011         --count;
   1012     }
   1013     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   1014 
   1015     /* decode a sequence of single and lead bytes */
   1016     while(source<sourceLimit) {
   1017         if(target>=targetLimit) {
   1018             /* target is full */
   1019             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1020             break;
   1021         }
   1022 
   1023         ++nextSourceIndex;
   1024         c=*source++;
   1025         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1026             /* Write a code point directly from a single-byte difference. */
   1027             c=prev+(c-BOCU1_MIDDLE);
   1028             if(c<0x3000) {
   1029                 *target++=(UChar)c;
   1030                 *offsets++=sourceIndex;
   1031                 prev=BOCU1_SIMPLE_PREV(c);
   1032                 sourceIndex=nextSourceIndex;
   1033                 goto fastSingle;
   1034             }
   1035         } else if(c<=0x20) {
   1036             /*
   1037              * Direct-encoded C0 control code or space.
   1038              * Reset prev for C0 control codes but not for space.
   1039              */
   1040             if(c!=0x20) {
   1041                 prev=BOCU1_ASCII_PREV;
   1042             }
   1043             *target++=(UChar)c;
   1044             *offsets++=sourceIndex;
   1045             sourceIndex=nextSourceIndex;
   1046             continue;
   1047         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1048             /* Optimize two-byte case. */
   1049             if(c>=BOCU1_MIDDLE) {
   1050                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1051             } else {
   1052                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1053             }
   1054 
   1055             /* trail byte */
   1056             ++nextSourceIndex;
   1057             c=decodeBocu1TrailByte(1, *source++);
   1058             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1059                 bytes[0]=source[-2];
   1060                 bytes[1]=source[-1];
   1061                 byteIndex=2;
   1062                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1063                 break;
   1064             }
   1065         } else if(c==BOCU1_RESET) {
   1066             /* only reset the state, no code point */
   1067             prev=BOCU1_ASCII_PREV;
   1068             sourceIndex=nextSourceIndex;
   1069             continue;
   1070         } else {
   1071             /*
   1072              * For multi-byte difference lead bytes, set the decoder state
   1073              * with the partial difference value from the lead byte and
   1074              * with the number of trail bytes.
   1075              */
   1076             bytes[0]=(uint8_t)c;
   1077             byteIndex=1;
   1078 
   1079             diff=decodeBocu1LeadByte(c);
   1080             count=diff&3;
   1081             diff>>=2;
   1082 getTrail:
   1083             for(;;) {
   1084                 if(source>=sourceLimit) {
   1085                     goto endloop;
   1086                 }
   1087                 ++nextSourceIndex;
   1088                 c=bytes[byteIndex++]=*source++;
   1089 
   1090                 /* trail byte in any position */
   1091                 c=decodeBocu1TrailByte(count, c);
   1092                 if(c<0) {
   1093                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1094                     goto endloop;
   1095                 }
   1096 
   1097                 diff+=c;
   1098                 if(--count==0) {
   1099                     /* final trail byte, deliver a code point */
   1100                     byteIndex=0;
   1101                     c=prev+diff;
   1102                     if((uint32_t)c>0x10ffff) {
   1103                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1104                         goto endloop;
   1105                     }
   1106                     break;
   1107                 }
   1108             }
   1109         }
   1110 
   1111         /* calculate the next prev and output c */
   1112         prev=BOCU1_PREV(c);
   1113         if(c<=0xffff) {
   1114             *target++=(UChar)c;
   1115             *offsets++=sourceIndex;
   1116         } else {
   1117             /* output surrogate pair */
   1118             *target++=U16_LEAD(c);
   1119             if(target<targetLimit) {
   1120                 *target++=U16_TRAIL(c);
   1121                 *offsets++=sourceIndex;
   1122                 *offsets++=sourceIndex;
   1123             } else {
   1124                 /* target overflow */
   1125                 *offsets++=sourceIndex;
   1126                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1127                 cnv->UCharErrorBufferLength=1;
   1128                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1129                 break;
   1130             }
   1131         }
   1132         sourceIndex=nextSourceIndex;
   1133     }
   1134 endloop:
   1135 
   1136     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1137         /* set the converter state in UConverter to deal with the next character */
   1138         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1139         cnv->mode=0;
   1140     } else {
   1141         /* set the converter state back into UConverter */
   1142         cnv->toUnicodeStatus=(uint32_t)prev;
   1143         cnv->mode=(diff<<2)|count;
   1144     }
   1145     cnv->toULength=byteIndex;
   1146 
   1147     /* write back the updated pointers */
   1148     pArgs->source=(const char *)source;
   1149     pArgs->target=target;
   1150     pArgs->offsets=offsets;
   1151     return;
   1152 }
   1153 
   1154 /*
   1155  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
   1156  * If a change is made in the original function, then either
   1157  * change this function the same way or
   1158  * re-copy the original function and remove the variables
   1159  * offsets, sourceIndex, and nextSourceIndex.
   1160  */
   1161 static void
   1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
   1163                 UErrorCode *pErrorCode) {
   1164     UConverter *cnv;
   1165     const uint8_t *source, *sourceLimit;
   1166     UChar *target;
   1167     const UChar *targetLimit;
   1168 
   1169     int32_t prev, count, diff, c;
   1170 
   1171     int8_t byteIndex;
   1172     uint8_t *bytes;
   1173 
   1174 U_ALIGN_CODE(16)
   1175 
   1176     /* set up the local pointers */
   1177     cnv=pArgs->converter;
   1178     source=(const uint8_t *)pArgs->source;
   1179     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1180     target=pArgs->target;
   1181     targetLimit=pArgs->targetLimit;
   1182 
   1183     /* get the converter state from UConverter */
   1184     prev=(int32_t)cnv->toUnicodeStatus;
   1185     if(prev==0) {
   1186         prev=BOCU1_ASCII_PREV;
   1187     }
   1188     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1189     count=diff&3;
   1190     diff>>=2;
   1191 
   1192     byteIndex=cnv->toULength;
   1193     bytes=cnv->toUBytes;
   1194 
   1195     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
   1196     if(count>0 && byteIndex>0 && target<targetLimit) {
   1197         goto getTrail;
   1198     }
   1199 
   1200 fastSingle:
   1201     /* fast loop for single-byte differences */
   1202     /* use count as the only loop counter variable */
   1203     diff=(int32_t)(sourceLimit-source);
   1204     count=(int32_t)(pArgs->targetLimit-target);
   1205     if(count>diff) {
   1206         count=diff;
   1207     }
   1208     while(count>0) {
   1209         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1210             c=prev+(c-BOCU1_MIDDLE);
   1211             if(c<0x3000) {
   1212                 *target++=(UChar)c;
   1213                 prev=BOCU1_SIMPLE_PREV(c);
   1214             } else {
   1215                 break;
   1216             }
   1217         } else if(c<=0x20) {
   1218             if(c!=0x20) {
   1219                 prev=BOCU1_ASCII_PREV;
   1220             }
   1221             *target++=(UChar)c;
   1222         } else {
   1223             break;
   1224         }
   1225         ++source;
   1226         --count;
   1227     }
   1228 
   1229     /* decode a sequence of single and lead bytes */
   1230     while(source<sourceLimit) {
   1231         if(target>=targetLimit) {
   1232             /* target is full */
   1233             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1234             break;
   1235         }
   1236 
   1237         c=*source++;
   1238         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1239             /* Write a code point directly from a single-byte difference. */
   1240             c=prev+(c-BOCU1_MIDDLE);
   1241             if(c<0x3000) {
   1242                 *target++=(UChar)c;
   1243                 prev=BOCU1_SIMPLE_PREV(c);
   1244                 goto fastSingle;
   1245             }
   1246         } else if(c<=0x20) {
   1247             /*
   1248              * Direct-encoded C0 control code or space.
   1249              * Reset prev for C0 control codes but not for space.
   1250              */
   1251             if(c!=0x20) {
   1252                 prev=BOCU1_ASCII_PREV;
   1253             }
   1254             *target++=(UChar)c;
   1255             continue;
   1256         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1257             /* Optimize two-byte case. */
   1258             if(c>=BOCU1_MIDDLE) {
   1259                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1260             } else {
   1261                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1262             }
   1263 
   1264             /* trail byte */
   1265             c=decodeBocu1TrailByte(1, *source++);
   1266             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1267                 bytes[0]=source[-2];
   1268                 bytes[1]=source[-1];
   1269                 byteIndex=2;
   1270                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1271                 break;
   1272             }
   1273         } else if(c==BOCU1_RESET) {
   1274             /* only reset the state, no code point */
   1275             prev=BOCU1_ASCII_PREV;
   1276             continue;
   1277         } else {
   1278             /*
   1279              * For multi-byte difference lead bytes, set the decoder state
   1280              * with the partial difference value from the lead byte and
   1281              * with the number of trail bytes.
   1282              */
   1283             bytes[0]=(uint8_t)c;
   1284             byteIndex=1;
   1285 
   1286             diff=decodeBocu1LeadByte(c);
   1287             count=diff&3;
   1288             diff>>=2;
   1289 getTrail:
   1290             for(;;) {
   1291                 if(source>=sourceLimit) {
   1292                     goto endloop;
   1293                 }
   1294                 c=bytes[byteIndex++]=*source++;
   1295 
   1296                 /* trail byte in any position */
   1297                 c=decodeBocu1TrailByte(count, c);
   1298                 if(c<0) {
   1299                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1300                     goto endloop;
   1301                 }
   1302 
   1303                 diff+=c;
   1304                 if(--count==0) {
   1305                     /* final trail byte, deliver a code point */
   1306                     byteIndex=0;
   1307                     c=prev+diff;
   1308                     if((uint32_t)c>0x10ffff) {
   1309                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1310                         goto endloop;
   1311                     }
   1312                     break;
   1313                 }
   1314             }
   1315         }
   1316 
   1317         /* calculate the next prev and output c */
   1318         prev=BOCU1_PREV(c);
   1319         if(c<=0xffff) {
   1320             *target++=(UChar)c;
   1321         } else {
   1322             /* output surrogate pair */
   1323             *target++=U16_LEAD(c);
   1324             if(target<targetLimit) {
   1325                 *target++=U16_TRAIL(c);
   1326             } else {
   1327                 /* target overflow */
   1328                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1329                 cnv->UCharErrorBufferLength=1;
   1330                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1331                 break;
   1332             }
   1333         }
   1334     }
   1335 endloop:
   1336 
   1337     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1338         /* set the converter state in UConverter to deal with the next character */
   1339         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1340         cnv->mode=0;
   1341     } else {
   1342         /* set the converter state back into UConverter */
   1343         cnv->toUnicodeStatus=(uint32_t)prev;
   1344         cnv->mode=(diff<<2)|count;
   1345     }
   1346     cnv->toULength=byteIndex;
   1347 
   1348     /* write back the updated pointers */
   1349     pArgs->source=(const char *)source;
   1350     pArgs->target=target;
   1351     return;
   1352 }
   1353 
   1354 /* miscellaneous ------------------------------------------------------------ */
   1355 
   1356 static const UConverterImpl _Bocu1Impl={
   1357     UCNV_BOCU1,
   1358 
   1359     NULL,
   1360     NULL,
   1361 
   1362     NULL,
   1363     NULL,
   1364     NULL,
   1365 
   1366     _Bocu1ToUnicode,
   1367     _Bocu1ToUnicodeWithOffsets,
   1368     _Bocu1FromUnicode,
   1369     _Bocu1FromUnicodeWithOffsets,
   1370     NULL,
   1371 
   1372     NULL,
   1373     NULL,
   1374     NULL,
   1375     NULL,
   1376     ucnv_getCompleteUnicodeSet,
   1377 
   1378     NULL,
   1379     NULL
   1380 };
   1381 
   1382 static const UConverterStaticData _Bocu1StaticData={
   1383     sizeof(UConverterStaticData),
   1384     "BOCU-1",
   1385     1214, /* CCSID for BOCU-1 */
   1386     UCNV_IBM, UCNV_BOCU1,
   1387     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
   1388     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
   1389     FALSE, FALSE,
   1390     0,
   1391     0,
   1392     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1393 };
   1394 
   1395 const UConverterSharedData _Bocu1Data={
   1396     sizeof(UConverterSharedData), ~((uint32_t)0),
   1397     NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl,
   1398     0,
   1399     UCNV_MBCS_TABLE_INITIALIZER
   1400 };
   1401 
   1402 #endif
   1403