Home | History | Annotate | Download | only in common
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2015, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  ucnvbocu.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002mar27
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is an implementation of the Binary Ordered Compression for Unicode,
     17 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     23 
     24 #include "unicode/ucnv.h"
     25 #include "unicode/ucnv_cb.h"
     26 #include "unicode/utf16.h"
     27 #include "putilimp.h"
     28 #include "ucnv_bld.h"
     29 #include "ucnv_cnv.h"
     30 #include "uassert.h"
     31 
     32 /* BOCU-1 constants and macros ---------------------------------------------- */
     33 
     34 /*
     35  * BOCU-1 encodes the code points of a Unicode string as
     36  * a sequence of byte-encoded differences (slope detection),
     37  * preserving lexical order.
     38  *
     39  * Optimize the difference-taking for runs of Unicode text within
     40  * small scripts:
     41  *
     42  * Most small scripts are allocated within aligned 128-blocks of Unicode
     43  * code points. Lexical order is preserved if the "previous code point" state
     44  * is always moved into the middle of such a block.
     45  *
     46  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     47  * areas into the middle of those areas.
     48  *
     49  * C0 control codes and space are encoded with their US-ASCII bytes.
     50  * "prev" is reset for C0 controls but not for space.
     51  */
     52 
     53 /* initial value for "prev": middle of the ASCII range */
     54 #define BOCU1_ASCII_PREV        0x40
     55 
     56 /* bounding byte values for differences */
     57 #define BOCU1_MIN               0x21
     58 #define BOCU1_MIDDLE            0x90
     59 #define BOCU1_MAX_LEAD          0xfe
     60 #define BOCU1_MAX_TRAIL         0xff
     61 #define BOCU1_RESET             0xff
     62 
     63 /* number of lead bytes */
     64 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
     65 
     66 /* adjust trail byte counts for the use of some C0 control byte values */
     67 #define BOCU1_TRAIL_CONTROLS_COUNT  20
     68 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
     69 
     70 /* number of trail bytes */
     71 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
     72 
     73 /*
     74  * number of positive and negative single-byte codes
     75  * (counting 0==BOCU1_MIDDLE among the positive ones)
     76  */
     77 #define BOCU1_SINGLE            64
     78 
     79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
     80 #define BOCU1_LEAD_2            43
     81 #define BOCU1_LEAD_3            3
     82 #define BOCU1_LEAD_4            1
     83 
     84 /* The difference value range for single-byters. */
     85 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
     86 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
     87 
     88 /* The difference value range for double-byters. */
     89 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     90 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     91 
     92 /* The difference value range for 3-byters. */
     93 #define BOCU1_REACH_POS_3   \
     94     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     95 
     96 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     97 
     98 /* The lead byte start values. */
     99 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
    100 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
    101 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
    102      /* ==BOCU1_MAX_LEAD */
    103 
    104 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
    105 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
    106 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    107      /* ==BOCU1_MIN+1 */
    108 
    109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
    110 #define BOCU1_LENGTH_FROM_LEAD(lead) \
    111     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
    112      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
    113      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
    114 
    115 /* The length of a byte sequence, according to its packed form. */
    116 #define BOCU1_LENGTH_FROM_PACKED(packed) \
    117     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    118 
    119 /*
    120  * 12 commonly used C0 control codes (and space) are only used to encode
    121  * themselves directly,
    122  * which makes BOCU-1 MIME-usable and reasonably safe for
    123  * ASCII-oriented software.
    124  *
    125  * These controls are
    126  *  0   NUL
    127  *
    128  *  7   BEL
    129  *  8   BS
    130  *
    131  *  9   TAB
    132  *  a   LF
    133  *  b   VT
    134  *  c   FF
    135  *  d   CR
    136  *
    137  *  e   SO
    138  *  f   SI
    139  *
    140  * 1a   SUB
    141  * 1b   ESC
    142  *
    143  * The other 20 C0 controls are also encoded directly (to preserve order)
    144  * but are also used as trail bytes in difference encoding
    145  * (for better compression).
    146  */
    147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    148 
    149 /*
    150  * Byte value map for control codes,
    151  * from external byte values 0x00..0x20
    152  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    153  * External byte values that are illegal as trail bytes are mapped to -1.
    154  */
    155 static const int8_t
    156 bocu1ByteToTrail[BOCU1_MIN]={
    157 /*  0     1     2     3     4     5     6     7    */
    158     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    159 
    160 /*  8     9     a     b     c     d     e     f    */
    161     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    162 
    163 /*  10    11    12    13    14    15    16    17   */
    164     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    165 
    166 /*  18    19    1a    1b    1c    1d    1e    1f   */
    167     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    168 
    169 /*  20   */
    170     -1
    171 };
    172 
    173 /*
    174  * Byte value map for control codes,
    175  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    176  * to external byte values 0x00..0x20.
    177  */
    178 static const int8_t
    179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    180 /*  0     1     2     3     4     5     6     7    */
    181     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    182 
    183 /*  8     9     a     b     c     d     e     f    */
    184     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    185 
    186 /*  10    11    12    13   */
    187     0x1c, 0x1d, 0x1e, 0x1f
    188 };
    189 
    190 /**
    191  * Integer division and modulo with negative numerators
    192  * yields negative modulo results and quotients that are one more than
    193  * what we need here.
    194  * This macro adjust the results so that the modulo-value m is always >=0.
    195  *
    196  * For positive n, the if() condition is always FALSE.
    197  *
    198  * @param n Number to be split into quotient and rest.
    199  *          Will be modified to contain the quotient.
    200  * @param d Divisor.
    201  * @param m Output variable for the rest (modulo result).
    202  */
    203 #define NEGDIVMOD(n, d, m) { \
    204     (m)=(n)%(d); \
    205     (n)/=(d); \
    206     if((m)<0) { \
    207         --(n); \
    208         (m)+=(d); \
    209     } \
    210 }
    211 
    212 /* Faster versions of packDiff() for single-byte-encoded diff values. */
    213 
    214 /** Is a diff value encodable in a single byte? */
    215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
    216 
    217 /** Encode a diff value in a single byte. */
    218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
    219 
    220 /** Is a diff value encodable in two bytes? */
    221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
    222 
    223 /* BOCU-1 implementation functions ------------------------------------------ */
    224 
    225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
    226 
    227 /**
    228  * Compute the next "previous" value for differencing
    229  * from the current code point.
    230  *
    231  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
    232  * @return "previous code point" state value
    233  */
    234 static inline int32_t
    235 bocu1Prev(int32_t c) {
    236     /* compute new prev */
    237     if(/* 0x3040<=c && */ c<=0x309f) {
    238         /* Hiragana is not 128-aligned */
    239         return 0x3070;
    240     } else if(0x4e00<=c && c<=0x9fa5) {
    241         /* CJK Unihan */
    242         return 0x4e00-BOCU1_REACH_NEG_2;
    243     } else if(0xac00<=c /* && c<=0xd7a3 */) {
    244         /* Korean Hangul */
    245         return (0xd7a3+0xac00)/2;
    246     } else {
    247         /* mostly small scripts */
    248         return BOCU1_SIMPLE_PREV(c);
    249     }
    250 }
    251 
    252 /** Fast version of bocu1Prev() for most scripts. */
    253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
    254 
    255 /*
    256  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
    257  * The UConverter fields are used as follows:
    258  *
    259  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    260  *
    261  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    262  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
    263  */
    264 
    265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
    266 
    267 /**
    268  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    269  * and return a packed integer with them.
    270  *
    271  * The encoding favors small absolute differences with short encodings
    272  * to compress runs of same-script characters.
    273  *
    274  * Optimized version with unrolled loops and fewer floating-point operations
    275  * than the standard packDiff().
    276  *
    277  * @param diff difference value -0x10ffff..0x10ffff
    278  * @return
    279  *      0x010000zz for 1-byte sequence zz
    280  *      0x0200yyzz for 2-byte sequence yy zz
    281  *      0x03xxyyzz for 3-byte sequence xx yy zz
    282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    283  */
    284 static int32_t
    285 packDiff(int32_t diff) {
    286     int32_t result, m;
    287 
    288     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    289     if(diff>=BOCU1_REACH_NEG_1) {
    290         /* mostly positive differences, and single-byte negative ones */
    291 #if 0   /* single-byte case handled in macros, see below */
    292         if(diff<=BOCU1_REACH_POS_1) {
    293             /* single byte */
    294             return 0x01000000|(BOCU1_MIDDLE+diff);
    295         } else
    296 #endif
    297         if(diff<=BOCU1_REACH_POS_2) {
    298             /* two bytes */
    299             diff-=BOCU1_REACH_POS_1+1;
    300             result=0x02000000;
    301 
    302             m=diff%BOCU1_TRAIL_COUNT;
    303             diff/=BOCU1_TRAIL_COUNT;
    304             result|=BOCU1_TRAIL_TO_BYTE(m);
    305 
    306             result|=(BOCU1_START_POS_2+diff)<<8;
    307         } else if(diff<=BOCU1_REACH_POS_3) {
    308             /* three bytes */
    309             diff-=BOCU1_REACH_POS_2+1;
    310             result=0x03000000;
    311 
    312             m=diff%BOCU1_TRAIL_COUNT;
    313             diff/=BOCU1_TRAIL_COUNT;
    314             result|=BOCU1_TRAIL_TO_BYTE(m);
    315 
    316             m=diff%BOCU1_TRAIL_COUNT;
    317             diff/=BOCU1_TRAIL_COUNT;
    318             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    319 
    320             result|=(BOCU1_START_POS_3+diff)<<16;
    321         } else {
    322             /* four bytes */
    323             diff-=BOCU1_REACH_POS_3+1;
    324 
    325             m=diff%BOCU1_TRAIL_COUNT;
    326             diff/=BOCU1_TRAIL_COUNT;
    327             result=BOCU1_TRAIL_TO_BYTE(m);
    328 
    329             m=diff%BOCU1_TRAIL_COUNT;
    330             diff/=BOCU1_TRAIL_COUNT;
    331             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    332 
    333             /*
    334              * We know that / and % would deliver quotient 0 and rest=diff.
    335              * Avoid division and modulo for performance.
    336              */
    337             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
    338 
    339             result|=((uint32_t)BOCU1_START_POS_4)<<24;
    340         }
    341     } else {
    342         /* two- to four-byte negative differences */
    343         if(diff>=BOCU1_REACH_NEG_2) {
    344             /* two bytes */
    345             diff-=BOCU1_REACH_NEG_1;
    346             result=0x02000000;
    347 
    348             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    349             result|=BOCU1_TRAIL_TO_BYTE(m);
    350 
    351             result|=(BOCU1_START_NEG_2+diff)<<8;
    352         } else if(diff>=BOCU1_REACH_NEG_3) {
    353             /* three bytes */
    354             diff-=BOCU1_REACH_NEG_2;
    355             result=0x03000000;
    356 
    357             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    358             result|=BOCU1_TRAIL_TO_BYTE(m);
    359 
    360             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    361             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    362 
    363             result|=(BOCU1_START_NEG_3+diff)<<16;
    364         } else {
    365             /* four bytes */
    366             diff-=BOCU1_REACH_NEG_3;
    367 
    368             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    369             result=BOCU1_TRAIL_TO_BYTE(m);
    370 
    371             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    372             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    373 
    374             /*
    375              * We know that NEGDIVMOD would deliver
    376              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
    377              * Avoid division and modulo for performance.
    378              */
    379             m=diff+BOCU1_TRAIL_COUNT;
    380             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
    381 
    382             result|=BOCU1_MIN<<24;
    383         }
    384     }
    385     return result;
    386 }
    387 
    388 
    389 static void
    390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    391                              UErrorCode *pErrorCode) {
    392     UConverter *cnv;
    393     const UChar *source, *sourceLimit;
    394     uint8_t *target;
    395     int32_t targetCapacity;
    396     int32_t *offsets;
    397 
    398     int32_t prev, c, diff;
    399 
    400     int32_t sourceIndex, nextSourceIndex;
    401 
    402     /* set up the local pointers */
    403     cnv=pArgs->converter;
    404     source=pArgs->source;
    405     sourceLimit=pArgs->sourceLimit;
    406     target=(uint8_t *)pArgs->target;
    407     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    408     offsets=pArgs->offsets;
    409 
    410     /* get the converter state from UConverter */
    411     c=cnv->fromUChar32;
    412     prev=(int32_t)cnv->fromUnicodeStatus;
    413     if(prev==0) {
    414         prev=BOCU1_ASCII_PREV;
    415     }
    416 
    417     /* sourceIndex=-1 if the current character began in the previous buffer */
    418     sourceIndex= c==0 ? 0 : -1;
    419     nextSourceIndex=0;
    420 
    421     /* conversion loop */
    422     if(c!=0 && targetCapacity>0) {
    423         goto getTrail;
    424     }
    425 
    426 fastSingle:
    427     /* fast loop for single-byte differences */
    428     /* use only one loop counter variable, targetCapacity, not also source */
    429     diff=(int32_t)(sourceLimit-source);
    430     if(targetCapacity>diff) {
    431         targetCapacity=diff;
    432     }
    433     while(targetCapacity>0 && (c=*source)<0x3000) {
    434         if(c<=0x20) {
    435             if(c!=0x20) {
    436                 prev=BOCU1_ASCII_PREV;
    437             }
    438             *target++=(uint8_t)c;
    439             *offsets++=nextSourceIndex++;
    440             ++source;
    441             --targetCapacity;
    442         } else {
    443             diff=c-prev;
    444             if(DIFF_IS_SINGLE(diff)) {
    445                 prev=BOCU1_SIMPLE_PREV(c);
    446                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    447                 *offsets++=nextSourceIndex++;
    448                 ++source;
    449                 --targetCapacity;
    450             } else {
    451                 break;
    452             }
    453         }
    454     }
    455     /* restore real values */
    456     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    457     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
    458 
    459     /* regular loop for all cases */
    460     while(source<sourceLimit) {
    461         if(targetCapacity>0) {
    462             c=*source++;
    463             ++nextSourceIndex;
    464 
    465             if(c<=0x20) {
    466                 /*
    467                  * ISO C0 control & space:
    468                  * Encode directly for MIME compatibility,
    469                  * and reset state except for space, to not disrupt compression.
    470                  */
    471                 if(c!=0x20) {
    472                     prev=BOCU1_ASCII_PREV;
    473                 }
    474                 *target++=(uint8_t)c;
    475                 *offsets++=sourceIndex;
    476                 --targetCapacity;
    477 
    478                 sourceIndex=nextSourceIndex;
    479                 continue;
    480             }
    481 
    482             if(U16_IS_LEAD(c)) {
    483 getTrail:
    484                 if(source<sourceLimit) {
    485                     /* test the following code unit */
    486                     UChar trail=*source;
    487                     if(U16_IS_TRAIL(trail)) {
    488                         ++source;
    489                         ++nextSourceIndex;
    490                         c=U16_GET_SUPPLEMENTARY(c, trail);
    491                     }
    492                 } else {
    493                     /* no more input */
    494                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    495                     break;
    496                 }
    497             }
    498 
    499             /*
    500              * all other Unicode code points c==U+0021..U+10ffff
    501              * are encoded with the difference c-prev
    502              *
    503              * a new prev is computed from c,
    504              * placed in the middle of a 0x80-block (for most small scripts) or
    505              * in the middle of the Unihan and Hangul blocks
    506              * to statistically minimize the following difference
    507              */
    508             diff=c-prev;
    509             prev=BOCU1_PREV(c);
    510             if(DIFF_IS_SINGLE(diff)) {
    511                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    512                 *offsets++=sourceIndex;
    513                 --targetCapacity;
    514                 sourceIndex=nextSourceIndex;
    515                 if(c<0x3000) {
    516                     goto fastSingle;
    517                 }
    518             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    519                 /* optimize 2-byte case */
    520                 int32_t m;
    521 
    522                 if(diff>=0) {
    523                     diff-=BOCU1_REACH_POS_1+1;
    524                     m=diff%BOCU1_TRAIL_COUNT;
    525                     diff/=BOCU1_TRAIL_COUNT;
    526                     diff+=BOCU1_START_POS_2;
    527                 } else {
    528                     diff-=BOCU1_REACH_NEG_1;
    529                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    530                     diff+=BOCU1_START_NEG_2;
    531                 }
    532                 *target++=(uint8_t)diff;
    533                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    534                 *offsets++=sourceIndex;
    535                 *offsets++=sourceIndex;
    536                 targetCapacity-=2;
    537                 sourceIndex=nextSourceIndex;
    538             } else {
    539                 int32_t length; /* will be 2..4 */
    540 
    541                 diff=packDiff(diff);
    542                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    543 
    544                 /* write the output character bytes from diff and length */
    545                 /* from the first if in the loop we know that targetCapacity>0 */
    546                 if(length<=targetCapacity) {
    547                     switch(length) {
    548                         /* each branch falls through to the next one */
    549                     case 4:
    550                         *target++=(uint8_t)(diff>>24);
    551                         *offsets++=sourceIndex;
    552                     case 3: /*fall through*/
    553                         *target++=(uint8_t)(diff>>16);
    554                         *offsets++=sourceIndex;
    555                     case 2: /*fall through*/
    556                         *target++=(uint8_t)(diff>>8);
    557                         *offsets++=sourceIndex;
    558                     /* case 1: handled above */
    559                         *target++=(uint8_t)diff;
    560                         *offsets++=sourceIndex;
    561                     default:
    562                         /* will never occur */
    563                         break;
    564                     }
    565                     targetCapacity-=length;
    566                     sourceIndex=nextSourceIndex;
    567                 } else {
    568                     uint8_t *charErrorBuffer;
    569 
    570                     /*
    571                      * We actually do this backwards here:
    572                      * In order to save an intermediate variable, we output
    573                      * first to the overflow buffer what does not fit into the
    574                      * regular target.
    575                      */
    576                     /* we know that 1<=targetCapacity<length<=4 */
    577                     length-=targetCapacity;
    578                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    579                     switch(length) {
    580                         /* each branch falls through to the next one */
    581                     case 3:
    582                         *charErrorBuffer++=(uint8_t)(diff>>16);
    583                     case 2: /*fall through*/
    584                         *charErrorBuffer++=(uint8_t)(diff>>8);
    585                     case 1: /*fall through*/
    586                         *charErrorBuffer=(uint8_t)diff;
    587                     default:
    588                         /* will never occur */
    589                         break;
    590                     }
    591                     cnv->charErrorBufferLength=(int8_t)length;
    592 
    593                     /* now output what fits into the regular target */
    594                     diff>>=8*length; /* length was reduced by targetCapacity */
    595                     switch(targetCapacity) {
    596                         /* each branch falls through to the next one */
    597                     case 3:
    598                         *target++=(uint8_t)(diff>>16);
    599                         *offsets++=sourceIndex;
    600                     case 2: /*fall through*/
    601                         *target++=(uint8_t)(diff>>8);
    602                         *offsets++=sourceIndex;
    603                     case 1: /*fall through*/
    604                         *target++=(uint8_t)diff;
    605                         *offsets++=sourceIndex;
    606                     default:
    607                         /* will never occur */
    608                         break;
    609                     }
    610 
    611                     /* target overflow */
    612                     targetCapacity=0;
    613                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    614                     break;
    615                 }
    616             }
    617         } else {
    618             /* target is full */
    619             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    620             break;
    621         }
    622     }
    623 
    624     /* set the converter state back into UConverter */
    625     cnv->fromUChar32= c<0 ? -c : 0;
    626     cnv->fromUnicodeStatus=(uint32_t)prev;
    627 
    628     /* write back the updated pointers */
    629     pArgs->source=source;
    630     pArgs->target=(char *)target;
    631     pArgs->offsets=offsets;
    632 }
    633 
    634 /*
    635  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
    636  * If a change is made in the original function, then either
    637  * change this function the same way or
    638  * re-copy the original function and remove the variables
    639  * offsets, sourceIndex, and nextSourceIndex.
    640  */
    641 static void
    642 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
    643                   UErrorCode *pErrorCode) {
    644     UConverter *cnv;
    645     const UChar *source, *sourceLimit;
    646     uint8_t *target;
    647     int32_t targetCapacity;
    648 
    649     int32_t prev, c, diff;
    650 
    651     /* set up the local pointers */
    652     cnv=pArgs->converter;
    653     source=pArgs->source;
    654     sourceLimit=pArgs->sourceLimit;
    655     target=(uint8_t *)pArgs->target;
    656     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    657 
    658     /* get the converter state from UConverter */
    659     c=cnv->fromUChar32;
    660     prev=(int32_t)cnv->fromUnicodeStatus;
    661     if(prev==0) {
    662         prev=BOCU1_ASCII_PREV;
    663     }
    664 
    665     /* conversion loop */
    666     if(c!=0 && targetCapacity>0) {
    667         goto getTrail;
    668     }
    669 
    670 fastSingle:
    671     /* fast loop for single-byte differences */
    672     /* use only one loop counter variable, targetCapacity, not also source */
    673     diff=(int32_t)(sourceLimit-source);
    674     if(targetCapacity>diff) {
    675         targetCapacity=diff;
    676     }
    677     while(targetCapacity>0 && (c=*source)<0x3000) {
    678         if(c<=0x20) {
    679             if(c!=0x20) {
    680                 prev=BOCU1_ASCII_PREV;
    681             }
    682             *target++=(uint8_t)c;
    683         } else {
    684             diff=c-prev;
    685             if(DIFF_IS_SINGLE(diff)) {
    686                 prev=BOCU1_SIMPLE_PREV(c);
    687                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    688             } else {
    689                 break;
    690             }
    691         }
    692         ++source;
    693         --targetCapacity;
    694     }
    695     /* restore real values */
    696     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    697 
    698     /* regular loop for all cases */
    699     while(source<sourceLimit) {
    700         if(targetCapacity>0) {
    701             c=*source++;
    702 
    703             if(c<=0x20) {
    704                 /*
    705                  * ISO C0 control & space:
    706                  * Encode directly for MIME compatibility,
    707                  * and reset state except for space, to not disrupt compression.
    708                  */
    709                 if(c!=0x20) {
    710                     prev=BOCU1_ASCII_PREV;
    711                 }
    712                 *target++=(uint8_t)c;
    713                 --targetCapacity;
    714                 continue;
    715             }
    716 
    717             if(U16_IS_LEAD(c)) {
    718 getTrail:
    719                 if(source<sourceLimit) {
    720                     /* test the following code unit */
    721                     UChar trail=*source;
    722                     if(U16_IS_TRAIL(trail)) {
    723                         ++source;
    724                         c=U16_GET_SUPPLEMENTARY(c, trail);
    725                     }
    726                 } else {
    727                     /* no more input */
    728                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    729                     break;
    730                 }
    731             }
    732 
    733             /*
    734              * all other Unicode code points c==U+0021..U+10ffff
    735              * are encoded with the difference c-prev
    736              *
    737              * a new prev is computed from c,
    738              * placed in the middle of a 0x80-block (for most small scripts) or
    739              * in the middle of the Unihan and Hangul blocks
    740              * to statistically minimize the following difference
    741              */
    742             diff=c-prev;
    743             prev=BOCU1_PREV(c);
    744             if(DIFF_IS_SINGLE(diff)) {
    745                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    746                 --targetCapacity;
    747                 if(c<0x3000) {
    748                     goto fastSingle;
    749                 }
    750             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    751                 /* optimize 2-byte case */
    752                 int32_t m;
    753 
    754                 if(diff>=0) {
    755                     diff-=BOCU1_REACH_POS_1+1;
    756                     m=diff%BOCU1_TRAIL_COUNT;
    757                     diff/=BOCU1_TRAIL_COUNT;
    758                     diff+=BOCU1_START_POS_2;
    759                 } else {
    760                     diff-=BOCU1_REACH_NEG_1;
    761                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    762                     diff+=BOCU1_START_NEG_2;
    763                 }
    764                 *target++=(uint8_t)diff;
    765                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    766                 targetCapacity-=2;
    767             } else {
    768                 int32_t length; /* will be 2..4 */
    769 
    770                 diff=packDiff(diff);
    771                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    772 
    773                 /* write the output character bytes from diff and length */
    774                 /* from the first if in the loop we know that targetCapacity>0 */
    775                 if(length<=targetCapacity) {
    776                     switch(length) {
    777                         /* each branch falls through to the next one */
    778                     case 4:
    779                         *target++=(uint8_t)(diff>>24);
    780                     case 3: /*fall through*/
    781                         *target++=(uint8_t)(diff>>16);
    782                     /* case 2: handled above */
    783                         *target++=(uint8_t)(diff>>8);
    784                     /* case 1: handled above */
    785                         *target++=(uint8_t)diff;
    786                     default:
    787                         /* will never occur */
    788                         break;
    789                     }
    790                     targetCapacity-=length;
    791                 } else {
    792                     uint8_t *charErrorBuffer;
    793 
    794                     /*
    795                      * We actually do this backwards here:
    796                      * In order to save an intermediate variable, we output
    797                      * first to the overflow buffer what does not fit into the
    798                      * regular target.
    799                      */
    800                     /* we know that 1<=targetCapacity<length<=4 */
    801                     length-=targetCapacity;
    802                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    803                     switch(length) {
    804                         /* each branch falls through to the next one */
    805                     case 3:
    806                         *charErrorBuffer++=(uint8_t)(diff>>16);
    807                     case 2: /*fall through*/
    808                         *charErrorBuffer++=(uint8_t)(diff>>8);
    809                     case 1: /*fall through*/
    810                         *charErrorBuffer=(uint8_t)diff;
    811                     default:
    812                         /* will never occur */
    813                         break;
    814                     }
    815                     cnv->charErrorBufferLength=(int8_t)length;
    816 
    817                     /* now output what fits into the regular target */
    818                     diff>>=8*length; /* length was reduced by targetCapacity */
    819                     switch(targetCapacity) {
    820                         /* each branch falls through to the next one */
    821                     case 3:
    822                         *target++=(uint8_t)(diff>>16);
    823                     case 2: /*fall through*/
    824                         *target++=(uint8_t)(diff>>8);
    825                     case 1: /*fall through*/
    826                         *target++=(uint8_t)diff;
    827                     default:
    828                         /* will never occur */
    829                         break;
    830                     }
    831 
    832                     /* target overflow */
    833                     targetCapacity=0;
    834                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    835                     break;
    836                 }
    837             }
    838         } else {
    839             /* target is full */
    840             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    841             break;
    842         }
    843     }
    844 
    845     /* set the converter state back into UConverter */
    846     cnv->fromUChar32= c<0 ? -c : 0;
    847     cnv->fromUnicodeStatus=(uint32_t)prev;
    848 
    849     /* write back the updated pointers */
    850     pArgs->source=source;
    851     pArgs->target=(char *)target;
    852 }
    853 
    854 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
    855 
    856 /**
    857  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    858  *
         * Classifies the lead byte as the start of a two-, three- or four-byte
         * sequence and computes the part of the difference that the lead byte
         * contributes on its own. The callers in this file unpack the result
         * with count=state&3 and diff=state>>2 and then add one
         * decodeBocu1TrailByte() value per trail byte.
         *
    859  * @param b lead byte;
    860  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
    861  * @return (diff<<2)|count
         *         where count (1..3) is the number of trail bytes still expected
         *         and diff is the partial, possibly negative, difference
    862  */
    863 static inline int32_t
    864 decodeBocu1LeadByte(int32_t b) {
    865     int32_t diff, count;
    866 
            /*
             * Per the parameter contract b is never a single-byte lead, so
             * b>=BOCU1_START_NEG_2 can only be true for b>=BOCU1_START_POS_2.
             */
    867     if(b>=BOCU1_START_NEG_2) {
    868         /* positive difference */
    869         if(b<BOCU1_START_POS_3) {
    870             /* two bytes */
    871             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    872             count=1;
    873         } else if(b<BOCU1_START_POS_4) {
    874             /* three bytes */
    875             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    876             count=2;
    877         } else {
    878             /* four bytes */
    879             diff=BOCU1_REACH_POS_3+1;
    880             count=3;
    881         }
    882     } else {
    883         /* negative difference */
    884         if(b>=BOCU1_START_NEG_3) {
    885             /* two bytes */
    886             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    887             count=1;
    888         } else if(b>BOCU1_MIN) {
    889             /* three bytes */
    890             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    891             count=2;
    892         } else {
    893             /* four bytes */
    894             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    895             count=3;
    896         }
    897     }
    898 
    899     /* return the state for decoding the trail byte(s) */
            /*
             * diff is negative for negative differences. Shift it as an unsigned
             * value because left-shifting a negative signed integer is undefined
             * behavior; the resulting bit pattern is the one the callers unpack
             * with count=state&3 and diff=state>>2.
             */
    900     return (int32_t)((uint32_t)diff<<2)|count;
    901 }
    902 
    903 /**
    904  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    905  *
         * Converts one raw trail byte into its trail "digit" and scales that digit
         * by the weight of its position (1, BOCU1_TRAIL_COUNT or
         * BOCU1_TRAIL_COUNT squared, selected by count). This function neither
         * accumulates the difference nor changes count; the callers add the
         * returned value to diff and decrement count themselves.
         *
    906  * @param count number of remaining trail bytes including this one
    907  * @param b trail byte
    908  * @return new delta for diff including b - <0 indicates an error
    909  *
    910  * @see decodeBocu1
    911  */
    912 static inline int32_t
    913 decodeBocu1TrailByte(int32_t count, int32_t b) {
    914     if(b<=0x20) {
    915         /* skip some C0 controls and make the trail byte range contiguous */
    916         b=bocu1ByteToTrail[b];
    917         /* b<0 for an illegal trail byte value will result in return<0 below */
    918 #if BOCU1_MAX_TRAIL<0xff
                /* this branch exists only in builds where bytes above BOCU1_MAX_TRAIL are possible */
    919     } else if(b>BOCU1_MAX_TRAIL) {
    920         return -99;
    921 #endif
    922     } else {
                /* bytes above 0x20 map to trail digits by a constant offset */
    923         b-=BOCU1_TRAIL_BYTE_OFFSET;
    924     }
    925 
    926     /* scale the trail digit by the weight of its position; a negative b stays negative */
    927     if(count==1) {
    928         return b;
    929     } else if(count==2) {
    930         return b*BOCU1_TRAIL_COUNT;
    931     } else /* count==3 */ {
    932         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    933     }
    934 }
    935 
    936 static void
    937 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    938                            UErrorCode *pErrorCode) {
            /*
             * Decodes BOCU-1 bytes from pArgs->source into UTF-16 code units at
             * pArgs->target and records, for every code unit written, the source
             * index of the byte sequence that produced it in pArgs->offsets.
             *
             * Decoder state carried in the converter between calls:
             *   toUnicodeStatus  the "previous code point" (0 means BOCU1_ASCII_PREV)
             *   mode             (diff<<2)|count of an unfinished multi-byte sequence
             *   toUBytes/toULength  the bytes of that unfinished sequence
             *
             * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full
             * and to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result
             * outside 0..0x10ffff. On return the state and the advanced
             * source/target/offsets pointers are written back.
             *
             * NOTE(review): offsets is dereferenced unconditionally; presumably this
             * variant is only installed for calls with a non-NULL offsets array -
             * confirm against the converter framework.
             */
    939     UConverter *cnv;
    940     const uint8_t *source, *sourceLimit;
    941     UChar *target;
    942     const UChar *targetLimit;
    943     int32_t *offsets;
    944 
    945     int32_t prev, count, diff, c;
    946 
    947     int8_t byteIndex;
    948     uint8_t *bytes;
    949 
    950     int32_t sourceIndex, nextSourceIndex;
    951 
    952     /* set up the local pointers */
    953     cnv=pArgs->converter;
    954     source=(const uint8_t *)pArgs->source;
    955     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    956     target=pArgs->target;
    957     targetLimit=pArgs->targetLimit;
    958     offsets=pArgs->offsets;
    959 
    960     /* get the converter state from UConverter */
    961     prev=(int32_t)cnv->toUnicodeStatus;
    962     if(prev==0) {
    963         prev=BOCU1_ASCII_PREV;
    964     }
    965     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    966     count=diff&3;
    967     diff>>=2;
    968 
    969     byteIndex=cnv->toULength;
    970     bytes=cnv->toUBytes;
    971 
    972     /* sourceIndex=-1 if the current character began in the previous buffer */
    973     sourceIndex=byteIndex==0 ? 0 : -1;
    974     nextSourceIndex=0;
    975 
    976     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
            /* resume an unfinished multi-byte sequence only if bytes are actually pending */
    977     if(count>0 && byteIndex>0 && target<targetLimit) {
    978         goto getTrail;
    979     }
    980 
    981 fastSingle:
    982     /* fast loop for single-byte differences */
    983     /* use count as the only loop counter variable */
            /*
             * count=min(remaining source bytes, remaining target units): each
             * iteration consumes one byte and writes one unit, so the loop body
             * needs no further bounds checks. diff is only scratch space here.
             */
    984     diff=(int32_t)(sourceLimit-source);
    985     count=(int32_t)(pArgs->targetLimit-target);
    986     if(count>diff) {
    987         count=diff;
    988     }
    989     while(count>0) {
    990         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
    991             c=prev+(c-BOCU1_MIDDLE);
    992             if(c<0x3000) {
    993                 *target++=(UChar)c;
    994                 *offsets++=nextSourceIndex++;
    995                 prev=BOCU1_SIMPLE_PREV(c);
    996             } else {
                        /* leave the byte unread; the general loop below handles c>=0x3000 */
    997                 break;
    998             }
   999         } else if(c<=0x20) {
   1000             if(c!=0x20) {
   1001                 prev=BOCU1_ASCII_PREV;
   1002             }
   1003             *target++=(UChar)c;
   1004             *offsets++=nextSourceIndex++;
   1005         } else {
   1006             break;
   1007         }
   1008         ++source;
   1009         --count;
   1010     }
   1011     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   1012 
   1013     /* decode a sequence of single and lead bytes */
   1014     while(source<sourceLimit) {
   1015         if(target>=targetLimit) {
   1016             /* target is full */
   1017             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1018             break;
   1019         }
   1020 
   1021         ++nextSourceIndex;
   1022         c=*source++;
   1023         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1024             /* Write a code point directly from a single-byte difference. */
   1025             c=prev+(c-BOCU1_MIDDLE);
   1026             if(c<0x3000) {
   1027                 *target++=(UChar)c;
   1028                 *offsets++=sourceIndex;
   1029                 prev=BOCU1_SIMPLE_PREV(c);
   1030                 sourceIndex=nextSourceIndex;
   1031                 goto fastSingle;
   1032             }
                    /* c>=0x3000: fall through to the common output code after the if-chain */
   1033         } else if(c<=0x20) {
   1034             /*
   1035              * Direct-encoded C0 control code or space.
   1036              * Reset prev for C0 control codes but not for space.
   1037              */
   1038             if(c!=0x20) {
   1039                 prev=BOCU1_ASCII_PREV;
   1040             }
   1041             *target++=(UChar)c;
   1042             *offsets++=sourceIndex;
   1043             sourceIndex=nextSourceIndex;
   1044             continue;
   1045         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1046             /* Optimize two-byte case. */
   1047             if(c>=BOCU1_MIDDLE) {
   1048                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1049             } else {
   1050                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1051             }
   1052 
   1053             /* trail byte */
   1054             ++nextSourceIndex;
   1055             c=decodeBocu1TrailByte(1, *source++);
   1056             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                        /* record both bytes of the illegal sequence for the error callback */
   1057                 bytes[0]=source[-2];
   1058                 bytes[1]=source[-1];
   1059                 byteIndex=2;
   1060                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1061                 break;
   1062             }
   1063         } else if(c==BOCU1_RESET) {
   1064             /* only reset the state, no code point */
   1065             prev=BOCU1_ASCII_PREV;
   1066             sourceIndex=nextSourceIndex;
   1067             continue;
   1068         } else {
   1069             /*
   1070              * For multi-byte difference lead bytes, set the decoder state
   1071              * with the partial difference value from the lead byte and
   1072              * with the number of trail bytes.
   1073              */
   1074             bytes[0]=(uint8_t)c;
   1075             byteIndex=1;
   1076 
   1077             diff=decodeBocu1LeadByte(c);
   1078             count=diff&3;
   1079             diff>>=2;
   1080 getTrail:
                    /* also entered from the top of the function to resume a pending sequence */
   1081             for(;;) {
   1082                 if(source>=sourceLimit) {
                            /* input ends mid-sequence: diff, count and bytes are saved below */
   1083                     goto endloop;
   1084                 }
   1085                 ++nextSourceIndex;
   1086                 c=bytes[byteIndex++]=*source++;
   1087 
   1088                 /* trail byte in any position */
   1089                 c=decodeBocu1TrailByte(count, c);
   1090                 if(c<0) {
   1091                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1092                     goto endloop;
   1093                 }
   1094 
   1095                 diff+=c;
   1096                 if(--count==0) {
   1097                     /* final trail byte, deliver a code point */
                            /*
                             * NOTE(review): byteIndex is cleared before the range check,
                             * so the out-of-range error below is reported with
                             * toULength==0, unlike the two-byte path above which
                             * records its bytes - confirm this is intended.
                             */
   1098                     byteIndex=0;
   1099                     c=prev+diff;
   1100                     if((uint32_t)c>0x10ffff) {
   1101                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1102                         goto endloop;
   1103                     }
   1104                     break;
   1105                 }
   1106             }
   1107         }
   1108 
   1109         /* calculate the next prev and output c */
   1110         prev=BOCU1_PREV(c);
   1111         if(c<=0xffff) {
   1112             *target++=(UChar)c;
   1113             *offsets++=sourceIndex;
   1114         } else {
   1115             /* output surrogate pair */
   1116             *target++=U16_LEAD(c);
   1117             if(target<targetLimit) {
   1118                 *target++=U16_TRAIL(c);
   1119                 *offsets++=sourceIndex;
   1120                 *offsets++=sourceIndex;
   1121             } else {
   1122                 /* target overflow */
                        /* the trail surrogate is parked in the converter's overflow buffer */
   1123                 *offsets++=sourceIndex;
   1124                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1125                 cnv->UCharErrorBufferLength=1;
   1126                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1127                 break;
   1128             }
   1129         }
   1130         sourceIndex=nextSourceIndex;
   1131     }
   1132 endloop:
   1133 
   1134     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1135         /* set the converter state in UConverter to deal with the next character */
   1136         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1137         cnv->mode=0;
   1138     } else {
   1139         /* set the converter state back into UConverter */
   1140         cnv->toUnicodeStatus=(uint32_t)prev;
                /*
                 * diff can be negative (a pending negative difference) or arbitrary
                 * scratch from the fast loop; shift it as unsigned because
                 * left-shifting a negative signed value is undefined behavior.
                 * The stored bits are what the entry code unpacks with mode&3 and
                 * mode>>2, and they only matter while toULength>0.
                 */
   1141         cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
   1142     }
   1143     cnv->toULength=byteIndex;
   1144 
   1145     /* write back the updated pointers */
   1146     pArgs->source=(const char *)source;
   1147     pArgs->target=target;
   1148     pArgs->offsets=offsets;
   1149     return;
   1150 }
   1151 
   1152 /*
   1153  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
   1154  * If a change is made in the original function, then either
   1155  * change this function the same way or
   1156  * re-copy the original function and remove the variables
   1157  * offsets, sourceIndex, and nextSourceIndex.
         *
         * Decodes BOCU-1 bytes from pArgs->source into UTF-16 code units at
         * pArgs->target. Decoder state carried in the converter between calls:
         *   toUnicodeStatus  the "previous code point" (0 means BOCU1_ASCII_PREV)
         *   mode             (diff<<2)|count of an unfinished multi-byte sequence
         *   toUBytes/toULength  the bytes of that unfinished sequence
         *
         * *pErrorCode is set to U_BUFFER_OVERFLOW_ERROR when the target is full
         * and to U_ILLEGAL_CHAR_FOUND for an illegal trail byte or a result
         * outside 0..0x10ffff. On return the state and the advanced
         * source/target pointers are written back.
   1158  */
   1159 static void
   1160 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
   1161                 UErrorCode *pErrorCode) {
   1162     UConverter *cnv;
   1163     const uint8_t *source, *sourceLimit;
   1164     UChar *target;
   1165     const UChar *targetLimit;
   1166 
   1167     int32_t prev, count, diff, c;
   1168 
   1169     int8_t byteIndex;
   1170     uint8_t *bytes;
   1171 
   1172     /* set up the local pointers */
   1173     cnv=pArgs->converter;
   1174     source=(const uint8_t *)pArgs->source;
   1175     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1176     target=pArgs->target;
   1177     targetLimit=pArgs->targetLimit;
   1178 
   1179     /* get the converter state from UConverter */
   1180     prev=(int32_t)cnv->toUnicodeStatus;
   1181     if(prev==0) {
   1182         prev=BOCU1_ASCII_PREV;
   1183     }
   1184     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1185     count=diff&3;
   1186     diff>>=2;
   1187 
   1188     byteIndex=cnv->toULength;
   1189     bytes=cnv->toUBytes;
   1190 
   1191     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
            /* resume an unfinished multi-byte sequence only if bytes are actually pending */
   1192     if(count>0 && byteIndex>0 && target<targetLimit) {
   1193         goto getTrail;
   1194     }
   1195 
   1196 fastSingle:
   1197     /* fast loop for single-byte differences */
   1198     /* use count as the only loop counter variable */
            /*
             * count=min(remaining source bytes, remaining target units): each
             * iteration consumes one byte and writes one unit, so the loop body
             * needs no further bounds checks. diff is only scratch space here.
             */
   1199     diff=(int32_t)(sourceLimit-source);
   1200     count=(int32_t)(pArgs->targetLimit-target);
   1201     if(count>diff) {
   1202         count=diff;
   1203     }
   1204     while(count>0) {
   1205         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1206             c=prev+(c-BOCU1_MIDDLE);
   1207             if(c<0x3000) {
   1208                 *target++=(UChar)c;
   1209                 prev=BOCU1_SIMPLE_PREV(c);
   1210             } else {
                        /* leave the byte unread; the general loop below handles c>=0x3000 */
   1211                 break;
   1212             }
   1213         } else if(c<=0x20) {
   1214             if(c!=0x20) {
   1215                 prev=BOCU1_ASCII_PREV;
   1216             }
   1217             *target++=(UChar)c;
   1218         } else {
   1219             break;
   1220         }
   1221         ++source;
   1222         --count;
   1223     }
   1224 
   1225     /* decode a sequence of single and lead bytes */
   1226     while(source<sourceLimit) {
   1227         if(target>=targetLimit) {
   1228             /* target is full */
   1229             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1230             break;
   1231         }
   1232 
   1233         c=*source++;
   1234         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1235             /* Write a code point directly from a single-byte difference. */
   1236             c=prev+(c-BOCU1_MIDDLE);
   1237             if(c<0x3000) {
   1238                 *target++=(UChar)c;
   1239                 prev=BOCU1_SIMPLE_PREV(c);
   1240                 goto fastSingle;
   1241             }
                    /* c>=0x3000: fall through to the common output code after the if-chain */
   1242         } else if(c<=0x20) {
   1243             /*
   1244              * Direct-encoded C0 control code or space.
   1245              * Reset prev for C0 control codes but not for space.
   1246              */
   1247             if(c!=0x20) {
   1248                 prev=BOCU1_ASCII_PREV;
   1249             }
   1250             *target++=(UChar)c;
   1251             continue;
   1252         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1253             /* Optimize two-byte case. */
   1254             if(c>=BOCU1_MIDDLE) {
   1255                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1256             } else {
   1257                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1258             }
   1259 
   1260             /* trail byte */
   1261             c=decodeBocu1TrailByte(1, *source++);
   1262             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
                        /* record both bytes of the illegal sequence for the error callback */
   1263                 bytes[0]=source[-2];
   1264                 bytes[1]=source[-1];
   1265                 byteIndex=2;
   1266                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1267                 break;
   1268             }
   1269         } else if(c==BOCU1_RESET) {
   1270             /* only reset the state, no code point */
   1271             prev=BOCU1_ASCII_PREV;
   1272             continue;
   1273         } else {
   1274             /*
   1275              * For multi-byte difference lead bytes, set the decoder state
   1276              * with the partial difference value from the lead byte and
   1277              * with the number of trail bytes.
   1278              */
   1279             bytes[0]=(uint8_t)c;
   1280             byteIndex=1;
   1281 
   1282             diff=decodeBocu1LeadByte(c);
   1283             count=diff&3;
   1284             diff>>=2;
   1285 getTrail:
                    /* also entered from the top of the function to resume a pending sequence */
   1286             for(;;) {
   1287                 if(source>=sourceLimit) {
                            /* input ends mid-sequence: diff, count and bytes are saved below */
   1288                     goto endloop;
   1289                 }
   1290                 c=bytes[byteIndex++]=*source++;
   1291 
   1292                 /* trail byte in any position */
   1293                 c=decodeBocu1TrailByte(count, c);
   1294                 if(c<0) {
   1295                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1296                     goto endloop;
   1297                 }
   1298 
   1299                 diff+=c;
   1300                 if(--count==0) {
   1301                     /* final trail byte, deliver a code point */
                            /*
                             * NOTE(review): byteIndex is cleared before the range check,
                             * so the out-of-range error below is reported with
                             * toULength==0, unlike the two-byte path above which
                             * records its bytes - confirm this is intended.
                             */
   1302                     byteIndex=0;
   1303                     c=prev+diff;
   1304                     if((uint32_t)c>0x10ffff) {
   1305                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1306                         goto endloop;
   1307                     }
   1308                     break;
   1309                 }
   1310             }
   1311         }
   1312 
   1313         /* calculate the next prev and output c */
   1314         prev=BOCU1_PREV(c);
   1315         if(c<=0xffff) {
   1316             *target++=(UChar)c;
   1317         } else {
   1318             /* output surrogate pair */
   1319             *target++=U16_LEAD(c);
   1320             if(target<targetLimit) {
   1321                 *target++=U16_TRAIL(c);
   1322             } else {
   1323                 /* target overflow */
                        /* the trail surrogate is parked in the converter's overflow buffer */
   1324                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1325                 cnv->UCharErrorBufferLength=1;
   1326                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1327                 break;
   1328             }
   1329         }
   1330     }
   1331 endloop:
   1332 
   1333     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1334         /* set the converter state in UConverter to deal with the next character */
   1335         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1336         cnv->mode=0;
   1337     } else {
   1338         /* set the converter state back into UConverter */
   1339         cnv->toUnicodeStatus=(uint32_t)prev;
                /*
                 * diff can be negative (a pending negative difference) or arbitrary
                 * scratch from the fast loop; shift it as unsigned because
                 * left-shifting a negative signed value is undefined behavior.
                 * The stored bits are what the entry code unpacks with mode&3 and
                 * mode>>2, and they only matter while toULength>0.
                 */
   1340         cnv->mode=(int32_t)((uint32_t)diff<<2)|count;
   1341     }
   1342     cnv->toULength=byteIndex;
   1343 
   1344     /* write back the updated pointers */
   1345     pArgs->source=(const char *)source;
   1346     pArgs->target=target;
   1347     return;
   1348 }
   1349 
   1350 /* miscellaneous ------------------------------------------------------------ */
   1351 
   1352 static const UConverterImpl _Bocu1Impl={
            /*
             * Implementation table for the BOCU-1 converter. The initializer is
             * positional, so entries must stay in the order of the UConverterImpl
             * declaration (not visible in this file). Only the four conversion
             * functions and the Unicode-set getter are provided; every other slot
             * is NULL. NOTE(review): presumably NULL selects the framework's
             * default handling for that slot - confirm against the declaration.
             */
   1353     UCNV_BOCU1, /* converter type */
   1354 
   1355     NULL,
   1356     NULL,
   1357 
   1358     NULL,
   1359     NULL,
   1360     NULL,
   1361 
            /* decoder without/with offsets, then encoder without/with offsets */
   1362     _Bocu1ToUnicode,
   1363     _Bocu1ToUnicodeWithOffsets,
   1364     _Bocu1FromUnicode,
   1365     _Bocu1FromUnicodeWithOffsets,
   1366     NULL,
   1367 
   1368     NULL,
   1369     NULL,
   1370     NULL,
   1371     NULL,
   1372     ucnv_getCompleteUnicodeSet,
   1373 
   1374     NULL,
   1375     NULL
   1376 };
   1377 
   1378 static const UConverterStaticData _Bocu1StaticData={
            /*
             * Static description of the BOCU-1 converter. The initializer is
             * positional and must follow the UConverterStaticData declaration
             * (not visible in this file); fields without a comment here are left
             * undocumented rather than guessed.
             */
   1379     sizeof(UConverterStaticData),
   1380     "BOCU-1", /* converter name */
   1381     1214, /* CCSID for BOCU-1 */
   1382     UCNV_IBM, UCNV_BOCU1,
   1383     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
   1384     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
   1385     FALSE, FALSE,
   1386     0,
   1387     0,
   1388     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1389 };
   1390 
   1391 const UConverterSharedData _Bocu1Data=
                /* externally visible BOCU-1 converter data: ties the static data to the implementation table */
   1392         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
   1393 
   1394 #endif
   1395