Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 2002-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *   file name:  ucnvbocu.cpp
     11 *   encoding:   US-ASCII
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2002mar27
     16 *   created by: Markus W. Scherer
     17 *
     18 *   This is an implementation of the Binary Ordered Compression for Unicode,
     19 *   in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 
     24 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION
     25 
     26 #include "unicode/ucnv.h"
     27 #include "unicode/ucnv_cb.h"
     28 #include "unicode/utf16.h"
     29 #include "putilimp.h"
     30 #include "ucnv_bld.h"
     31 #include "ucnv_cnv.h"
     32 #include "uassert.h"
     33 
     34 /* BOCU-1 constants and macros ---------------------------------------------- */
     35 
     36 /*
     37  * BOCU-1 encodes the code points of a Unicode string as
     38  * a sequence of byte-encoded differences (slope detection),
     39  * preserving lexical order.
     40  *
     41  * Optimize the difference-taking for runs of Unicode text within
     42  * small scripts:
     43  *
     44  * Most small scripts are allocated within aligned 128-blocks of Unicode
     45  * code points. Lexical order is preserved if the "previous code point" state
     46  * is always moved into the middle of such a block.
     47  *
     48  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     49  * areas into the middle of those areas.
     50  *
     51  * C0 control codes and space are encoded with their US-ASCII bytes.
     52  * "prev" is reset for C0 controls but not for space.
     53  */
     54 
     55 /* initial value for "prev": middle of the ASCII range */
     56 #define BOCU1_ASCII_PREV        0x40
     57 
     58 /* bounding byte values for differences: lead bytes are 0x21..0xfe, trail bytes reach up to 0xff */
     59 #define BOCU1_MIN               0x21
     60 #define BOCU1_MIDDLE            0x90
     61 #define BOCU1_MAX_LEAD          0xfe
     62 #define BOCU1_MAX_TRAIL         0xff
     63 #define BOCU1_RESET             0xff
     64 
     65 /* number of lead bytes: 0xfe-0x21+1 = 222 */
     66 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
     67 
     68 /* adjust trail byte counts for the use of some C0 control byte values (offset = 0x21-20 = 13) */
     69 #define BOCU1_TRAIL_CONTROLS_COUNT  20
     70 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
     71 
     72 /* number of trail bytes: (0xff-0x21+1)+20 = 243 */
     73 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
     74 
     75 /*
     76  * number of positive and negative single-byte codes
     77  * (counting 0==BOCU1_MIDDLE among the positive ones)
     78  */
     79 #define BOCU1_SINGLE            64
     80 
     81 /* number of lead bytes for positive and negative 2/3/4-byte sequences (43+3+1 on each side) */
     82 #define BOCU1_LEAD_2            43
     83 #define BOCU1_LEAD_3            3
     84 #define BOCU1_LEAD_4            1
     85 
     86 /* The difference value range for single-byters: -64..63. */
     87 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
     88 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
     89 
     90 /* The difference value range for double-byters: -10513..10512. */
     91 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     92 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
     93 
     94 /* The difference value range for 3-byters: -187660..187659. */
     95 #define BOCU1_REACH_POS_3   \
     96     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     97 
     98 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
     99 
    100 /* The lead byte start values; for positive differences these are 0xd0, 0xfb, 0xfe. */
    101 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
    102 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
    103 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
    104      /* ==BOCU1_MAX_LEAD */
    105 
    106 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
    107 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
    108 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    109      /* ==BOCU1_MIN+1; the negative start values are 0x50, 0x25, 0x22 */
    110 
    111 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). The argument is evaluated several times. */
    112 #define BOCU1_LENGTH_FROM_LEAD(lead) \
    113     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
    114      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
    115      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
    116 
    117 /* The length of a byte sequence, according to its packed form: values below 0x04000000 carry the length in their top byte, anything else is a 4-byte sequence. */
    118 #define BOCU1_LENGTH_FROM_PACKED(packed) \
    119     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    120 
    121 /*
    122  * 12 commonly used C0 control codes (and space) are only used to encode
    123  * themselves directly,
    124  * which makes BOCU-1 MIME-usable and reasonably safe for
    125  * ASCII-oriented software.
    126  *
    127  * These controls are
    128  *  0   NUL
    129  *
    130  *  7   BEL
    131  *  8   BS
    132  *
    133  *  9   TAB
    134  *  a   LF
    135  *  b   VT
    136  *  c   FF
    137  *  d   CR
    138  *
    139  *  e   SO
    140  *  f   SI
    141  *
    142  * 1a   SUB
    143  * 1b   ESC
    144  *
    145  * The other 20 C0 controls are also encoded directly (to preserve order)
    146  * but are also used as trail bytes in difference encoding
    147  * (for better compression).
         *
         * BOCU1_TRAIL_TO_BYTE(t) maps a trail value t from the difference
         * calculation to its external byte value: values of at least
         * BOCU1_TRAIL_CONTROLS_COUNT (20) are shifted up by
         * BOCU1_TRAIL_BYTE_OFFSET (13), i.e. to 0x21 and above; smaller values
         * are looked up in bocu1TrailToByte[].
         * The argument is evaluated more than once.
    148  */
    149 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    150 
    151 /*
    152  * Byte value map for control codes,
    153  * from external byte values 0x00..0x20
    154  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    155  * External byte values that are illegal as trail bytes are mapped to -1.
    156  */
    157 static const int8_t
    158 bocu1ByteToTrail[BOCU1_MIN]={
    159 /*  0     1     2     3     4     5     6     7    */
    160     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    161 
    162 /*  8     9     a     b     c     d     e     f    */
    163     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    164 
    165 /*  10    11    12    13    14    15    16    17   */
    166     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    167 
    168 /*  18    19    1a    1b    1c    1d    1e    1f   */
    169     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    170 
    171 /*  20   */
    172     -1
    173 };
    174 
    175 /*
    176  * Byte value map for control codes,
    177  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    178  * to external byte values 0x00..0x20.
    179  */
    180 static const int8_t
    181 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    182 /*  0     1     2     3     4     5     6     7    */
    183     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    184 
    185 /*  8     9     a     b     c     d     e     f    */
    186     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    187 
    188 /*  10    11    12    13   */
    189     0x1c, 0x1d, 0x1e, 0x1f
    190 };
    191 
    192 /**
    193  * Integer division and modulo with negative numerators
    194  * yields negative modulo results and quotients that are one more than
    195  * what we need here.
    196  * This macro adjust the results so that the modulo-value m is always >=0.
    197  *
    198  * For positive n, the if() condition is always FALSE.
    199  *
    200  * @param n Number to be split into quotient and rest.
    201  *          Will be modified to contain the quotient.
    202  * @param d Divisor.
    203  * @param m Output variable for the rest (modulo result).
    204  */
    205 #define NEGDIVMOD(n, d, m) { \
    206     (m)=(n)%(d); \
    207     (n)/=(d); \
    208     if((m)<0) { \
    209         --(n); \
    210         (m)+=(d); \
    211     } \
    212 }
    213 
    214 /* Faster versions of packDiff() for single-byte-encoded diff values. */
    215 
    216 /** Is a diff value encodable in a single byte (BOCU1_REACH_NEG_1..BOCU1_REACH_POS_1, i.e. -64..63)? Evaluates diff twice. */
    217 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1)
    218 
    219 /** Encode a diff value in a single byte: BOCU1_MIDDLE (0x90) plus diff. No range check is done here. */
    220 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff))
    221 
    222 /** Is a diff value within the two-byte reach (BOCU1_REACH_NEG_2..BOCU1_REACH_POS_2)? Also true for single-byte diffs. Evaluates diff twice. */
    223 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)
    224 
    225 /* BOCU-1 implementation functions ------------------------------------------ */
    226 
        /** Default "prev" for c: clear the low 7 bits of c (start of its 128-aligned block) and add BOCU1_ASCII_PREV (0x40). */
    227 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV)
    228 
    229 /**
    230  * Compute the next "previous" value for differencing
    231  * from the current code point.
    232  *
    233  * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below)
    234  * @return "previous code point" state value
    235  */
    236 static inline int32_t
    237 bocu1Prev(int32_t c) {
    238     /* compute new prev */
    239     if(/* 0x3040<=c && */ c<=0x309f) {
    240         /* Hiragana is not 128-aligned */
    241         return 0x3070;
    242     } else if(0x4e00<=c && c<=0x9fa5) {
    243         /* CJK Unihan */
    244         return 0x4e00-BOCU1_REACH_NEG_2;
    245     } else if(0xac00<=c /* && c<=0xd7a3 */) {
    246         /* Korean Hangul */
    247         return (0xd7a3+0xac00)/2;
    248     } else {
    249         /* mostly small scripts */
    250         return BOCU1_SIMPLE_PREV(c);
    251     }
    252 }
    253 
    254 /** Fast version of bocu1Prev() for most scripts: outside 0x3040..0xd7a3 it expands to BOCU1_SIMPLE_PREV(c), otherwise it calls bocu1Prev(c). Evaluates c more than once. */
    255 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c))
    256 
    257 /*
    258  * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c.
    259  * The UConverter fields are used as follows:
    260  *
    261  * fromUnicodeStatus    encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    262  *
    263  * toUnicodeStatus      decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV)
    264  * mode                 decoder's incomplete (diff<<2)|count (ignored when toULength==0)
    265  */
    266 
    267 /* BOCU-1-from-Unicode conversion functions --------------------------------- */
    268 
    269 /**
    270  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    271  * and return a packed integer with them.
    272  *
    273  * The encoding favors small absolute differences with short encodings
    274  * to compress runs of same-script characters.
    275  *
    276  * Optimized version with unrolled loops and fewer floating-point operations
    277  * than the standard packDiff().
    278  *
    279  * @param diff difference value -0x10ffff..0x10ffff
    280  * @return
    281  *      0x010000zz for 1-byte sequence zz
    282  *      0x0200yyzz for 2-byte sequence yy zz
    283  *      0x03xxyyzz for 3-byte sequence xx yy zz
    284  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    285  */
    286 static int32_t
    287 packDiff(int32_t diff) {
    288     int32_t result, m;
    289 
    290     U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */
    291     if(diff>=BOCU1_REACH_NEG_1) {
    292         /* mostly positive differences, and single-byte negative ones */
    293 #if 0   /* single-byte case handled in macros, see below */
    294         if(diff<=BOCU1_REACH_POS_1) {
    295             /* single byte */
    296             return 0x01000000|(BOCU1_MIDDLE+diff);
    297         } else
    298 #endif
    299         if(diff<=BOCU1_REACH_POS_2) {
    300             /* two bytes */
    301             diff-=BOCU1_REACH_POS_1+1;
    302             result=0x02000000;
    303 
    304             m=diff%BOCU1_TRAIL_COUNT;
    305             diff/=BOCU1_TRAIL_COUNT;
    306             result|=BOCU1_TRAIL_TO_BYTE(m);
    307 
    308             result|=(BOCU1_START_POS_2+diff)<<8;
    309         } else if(diff<=BOCU1_REACH_POS_3) {
    310             /* three bytes */
    311             diff-=BOCU1_REACH_POS_2+1;
    312             result=0x03000000;
    313 
    314             m=diff%BOCU1_TRAIL_COUNT;
    315             diff/=BOCU1_TRAIL_COUNT;
    316             result|=BOCU1_TRAIL_TO_BYTE(m);
    317 
    318             m=diff%BOCU1_TRAIL_COUNT;
    319             diff/=BOCU1_TRAIL_COUNT;
    320             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    321 
    322             result|=(BOCU1_START_POS_3+diff)<<16;
    323         } else {
    324             /* four bytes */
    325             diff-=BOCU1_REACH_POS_3+1;
    326 
    327             m=diff%BOCU1_TRAIL_COUNT;
    328             diff/=BOCU1_TRAIL_COUNT;
    329             result=BOCU1_TRAIL_TO_BYTE(m);
    330 
    331             m=diff%BOCU1_TRAIL_COUNT;
    332             diff/=BOCU1_TRAIL_COUNT;
    333             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    334 
    335             /*
    336              * We know that / and % would deliver quotient 0 and rest=diff.
    337              * Avoid division and modulo for performance.
    338              */
    339             result|=BOCU1_TRAIL_TO_BYTE(diff)<<16;
    340 
    341             result|=((uint32_t)BOCU1_START_POS_4)<<24;
    342         }
    343     } else {
    344         /* two- to four-byte negative differences */
    345         if(diff>=BOCU1_REACH_NEG_2) {
    346             /* two bytes */
    347             diff-=BOCU1_REACH_NEG_1;
    348             result=0x02000000;
    349 
    350             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    351             result|=BOCU1_TRAIL_TO_BYTE(m);
    352 
    353             result|=(BOCU1_START_NEG_2+diff)<<8;
    354         } else if(diff>=BOCU1_REACH_NEG_3) {
    355             /* three bytes */
    356             diff-=BOCU1_REACH_NEG_2;
    357             result=0x03000000;
    358 
    359             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    360             result|=BOCU1_TRAIL_TO_BYTE(m);
    361 
    362             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    363             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    364 
    365             result|=(BOCU1_START_NEG_3+diff)<<16;
    366         } else {
    367             /* four bytes */
    368             diff-=BOCU1_REACH_NEG_3;
    369 
    370             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    371             result=BOCU1_TRAIL_TO_BYTE(m);
    372 
    373             NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    374             result|=BOCU1_TRAIL_TO_BYTE(m)<<8;
    375 
    376             /*
    377              * We know that NEGDIVMOD would deliver
    378              * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT.
    379              * Avoid division and modulo for performance.
    380              */
    381             m=diff+BOCU1_TRAIL_COUNT;
    382             result|=BOCU1_TRAIL_TO_BYTE(m)<<16;
    383 
    384             result|=BOCU1_MIN<<24;
    385         }
    386     }
    387     return result;
    388 }
    389 
    390 
    391 static void U_CALLCONV
    392 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
    393                              UErrorCode *pErrorCode) {
    394     UConverter *cnv;
    395     const UChar *source, *sourceLimit;
    396     uint8_t *target;
    397     int32_t targetCapacity;
    398     int32_t *offsets;
    399 
    400     int32_t prev, c, diff;
    401 
    402     int32_t sourceIndex, nextSourceIndex;
    403 
    404     /* set up the local pointers */
    405     cnv=pArgs->converter;
    406     source=pArgs->source;
    407     sourceLimit=pArgs->sourceLimit;
    408     target=(uint8_t *)pArgs->target;
    409     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    410     offsets=pArgs->offsets;
    411 
    412     /* get the converter state from UConverter */
    413     c=cnv->fromUChar32;
    414     prev=(int32_t)cnv->fromUnicodeStatus;
    415     if(prev==0) {
    416         prev=BOCU1_ASCII_PREV;
    417     }
    418 
    419     /* sourceIndex=-1 if the current character began in the previous buffer */
    420     sourceIndex= c==0 ? 0 : -1;
    421     nextSourceIndex=0;
    422 
    423     /* conversion loop */
    424     if(c!=0 && targetCapacity>0) {
    425         goto getTrail;
    426     }
    427 
    428 fastSingle:
    429     /* fast loop for single-byte differences */
    430     /* use only one loop counter variable, targetCapacity, not also source */
    431     diff=(int32_t)(sourceLimit-source);
    432     if(targetCapacity>diff) {
    433         targetCapacity=diff;
    434     }
    435     while(targetCapacity>0 && (c=*source)<0x3000) {
    436         if(c<=0x20) {
    437             if(c!=0x20) {
    438                 prev=BOCU1_ASCII_PREV;
    439             }
    440             *target++=(uint8_t)c;
    441             *offsets++=nextSourceIndex++;
    442             ++source;
    443             --targetCapacity;
    444         } else {
    445             diff=c-prev;
    446             if(DIFF_IS_SINGLE(diff)) {
    447                 prev=BOCU1_SIMPLE_PREV(c);
    448                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    449                 *offsets++=nextSourceIndex++;
    450                 ++source;
    451                 --targetCapacity;
    452             } else {
    453                 break;
    454             }
    455         }
    456     }
    457     /* restore real values */
    458     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    459     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
    460 
    461     /* regular loop for all cases */
    462     while(source<sourceLimit) {
    463         if(targetCapacity>0) {
    464             c=*source++;
    465             ++nextSourceIndex;
    466 
    467             if(c<=0x20) {
    468                 /*
    469                  * ISO C0 control & space:
    470                  * Encode directly for MIME compatibility,
    471                  * and reset state except for space, to not disrupt compression.
    472                  */
    473                 if(c!=0x20) {
    474                     prev=BOCU1_ASCII_PREV;
    475                 }
    476                 *target++=(uint8_t)c;
    477                 *offsets++=sourceIndex;
    478                 --targetCapacity;
    479 
    480                 sourceIndex=nextSourceIndex;
    481                 continue;
    482             }
    483 
    484             if(U16_IS_LEAD(c)) {
    485 getTrail:
    486                 if(source<sourceLimit) {
    487                     /* test the following code unit */
    488                     UChar trail=*source;
    489                     if(U16_IS_TRAIL(trail)) {
    490                         ++source;
    491                         ++nextSourceIndex;
    492                         c=U16_GET_SUPPLEMENTARY(c, trail);
    493                     }
    494                 } else {
    495                     /* no more input */
    496                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    497                     break;
    498                 }
    499             }
    500 
    501             /*
    502              * all other Unicode code points c==U+0021..U+10ffff
    503              * are encoded with the difference c-prev
    504              *
    505              * a new prev is computed from c,
    506              * placed in the middle of a 0x80-block (for most small scripts) or
    507              * in the middle of the Unihan and Hangul blocks
    508              * to statistically minimize the following difference
    509              */
    510             diff=c-prev;
    511             prev=BOCU1_PREV(c);
    512             if(DIFF_IS_SINGLE(diff)) {
    513                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    514                 *offsets++=sourceIndex;
    515                 --targetCapacity;
    516                 sourceIndex=nextSourceIndex;
    517                 if(c<0x3000) {
    518                     goto fastSingle;
    519                 }
    520             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    521                 /* optimize 2-byte case */
    522                 int32_t m;
    523 
    524                 if(diff>=0) {
    525                     diff-=BOCU1_REACH_POS_1+1;
    526                     m=diff%BOCU1_TRAIL_COUNT;
    527                     diff/=BOCU1_TRAIL_COUNT;
    528                     diff+=BOCU1_START_POS_2;
    529                 } else {
    530                     diff-=BOCU1_REACH_NEG_1;
    531                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    532                     diff+=BOCU1_START_NEG_2;
    533                 }
    534                 *target++=(uint8_t)diff;
    535                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    536                 *offsets++=sourceIndex;
    537                 *offsets++=sourceIndex;
    538                 targetCapacity-=2;
    539                 sourceIndex=nextSourceIndex;
    540             } else {
    541                 int32_t length; /* will be 2..4 */
    542 
    543                 diff=packDiff(diff);
    544                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    545 
    546                 /* write the output character bytes from diff and length */
    547                 /* from the first if in the loop we know that targetCapacity>0 */
    548                 if(length<=targetCapacity) {
    549                     switch(length) {
    550                         /* each branch falls through to the next one */
    551                     case 4:
    552                         *target++=(uint8_t)(diff>>24);
    553                         *offsets++=sourceIndex;
    554                         U_FALLTHROUGH;
    555                     case 3:
    556                         *target++=(uint8_t)(diff>>16);
    557                         *offsets++=sourceIndex;
    558                         U_FALLTHROUGH;
    559                     case 2:
    560                         *target++=(uint8_t)(diff>>8);
    561                         *offsets++=sourceIndex;
    562                     /* case 1: handled above */
    563                         *target++=(uint8_t)diff;
    564                         *offsets++=sourceIndex;
    565                         U_FALLTHROUGH;
    566                     default:
    567                         /* will never occur */
    568                         break;
    569                     }
    570                     targetCapacity-=length;
    571                     sourceIndex=nextSourceIndex;
    572                 } else {
    573                     uint8_t *charErrorBuffer;
    574 
    575                     /*
    576                      * We actually do this backwards here:
    577                      * In order to save an intermediate variable, we output
    578                      * first to the overflow buffer what does not fit into the
    579                      * regular target.
    580                      */
    581                     /* we know that 1<=targetCapacity<length<=4 */
    582                     length-=targetCapacity;
    583                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    584                     switch(length) {
    585                         /* each branch falls through to the next one */
    586                     case 3:
    587                         *charErrorBuffer++=(uint8_t)(diff>>16);
    588                         U_FALLTHROUGH;
    589                     case 2:
    590                         *charErrorBuffer++=(uint8_t)(diff>>8);
    591                         U_FALLTHROUGH;
    592                     case 1:
    593                         *charErrorBuffer=(uint8_t)diff;
    594                         U_FALLTHROUGH;
    595                     default:
    596                         /* will never occur */
    597                         break;
    598                     }
    599                     cnv->charErrorBufferLength=(int8_t)length;
    600 
    601                     /* now output what fits into the regular target */
    602                     diff>>=8*length; /* length was reduced by targetCapacity */
    603                     switch(targetCapacity) {
    604                         /* each branch falls through to the next one */
    605                     case 3:
    606                         *target++=(uint8_t)(diff>>16);
    607                         *offsets++=sourceIndex;
    608                         U_FALLTHROUGH;
    609                     case 2:
    610                         *target++=(uint8_t)(diff>>8);
    611                         *offsets++=sourceIndex;
    612                         U_FALLTHROUGH;
    613                     case 1:
    614                         *target++=(uint8_t)diff;
    615                         *offsets++=sourceIndex;
    616                         U_FALLTHROUGH;
    617                     default:
    618                         /* will never occur */
    619                         break;
    620                     }
    621 
    622                     /* target overflow */
    623                     targetCapacity=0;
    624                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    625                     break;
    626                 }
    627             }
    628         } else {
    629             /* target is full */
    630             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    631             break;
    632         }
    633     }
    634 
    635     /* set the converter state back into UConverter */
    636     cnv->fromUChar32= c<0 ? -c : 0;
    637     cnv->fromUnicodeStatus=(uint32_t)prev;
    638 
    639     /* write back the updated pointers */
    640     pArgs->source=source;
    641     pArgs->target=(char *)target;
    642     pArgs->offsets=offsets;
    643 }
    644 
    645 /*
    646  * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
    647  * If a change is made in the original function, then either
    648  * change this function the same way or
    649  * re-copy the original function and remove the variables
    650  * offsets, sourceIndex, and nextSourceIndex.
    651  */
    652 static void U_CALLCONV
    653 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
    654                   UErrorCode *pErrorCode) {
    655     UConverter *cnv;
    656     const UChar *source, *sourceLimit;
    657     uint8_t *target;
    658     int32_t targetCapacity;
    659 
    660     int32_t prev, c, diff;
    661 
    662     /* set up the local pointers */
    663     cnv=pArgs->converter;
    664     source=pArgs->source;
    665     sourceLimit=pArgs->sourceLimit;
    666     target=(uint8_t *)pArgs->target;
    667     targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    668 
    669     /* get the converter state from UConverter */
    670     c=cnv->fromUChar32;
    671     prev=(int32_t)cnv->fromUnicodeStatus;
    672     if(prev==0) {
    673         prev=BOCU1_ASCII_PREV;
    674     }
    675 
    676     /* conversion loop */
    677     if(c!=0 && targetCapacity>0) {
    678         goto getTrail;
    679     }
    680 
    681 fastSingle:
    682     /* fast loop for single-byte differences */
    683     /* use only one loop counter variable, targetCapacity, not also source */
    684     diff=(int32_t)(sourceLimit-source);
    685     if(targetCapacity>diff) {
    686         targetCapacity=diff;
    687     }
    688     while(targetCapacity>0 && (c=*source)<0x3000) {
    689         if(c<=0x20) {
    690             if(c!=0x20) {
    691                 prev=BOCU1_ASCII_PREV;
    692             }
    693             *target++=(uint8_t)c;
    694         } else {
    695             diff=c-prev;
    696             if(DIFF_IS_SINGLE(diff)) {
    697                 prev=BOCU1_SIMPLE_PREV(c);
    698                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    699             } else {
    700                 break;
    701             }
    702         }
    703         ++source;
    704         --targetCapacity;
    705     }
    706     /* restore real values */
    707     targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    708 
    709     /* regular loop for all cases */
    710     while(source<sourceLimit) {
    711         if(targetCapacity>0) {
    712             c=*source++;
    713 
    714             if(c<=0x20) {
    715                 /*
    716                  * ISO C0 control & space:
    717                  * Encode directly for MIME compatibility,
    718                  * and reset state except for space, to not disrupt compression.
    719                  */
    720                 if(c!=0x20) {
    721                     prev=BOCU1_ASCII_PREV;
    722                 }
    723                 *target++=(uint8_t)c;
    724                 --targetCapacity;
    725                 continue;
    726             }
    727 
    728             if(U16_IS_LEAD(c)) {
    729 getTrail:
    730                 if(source<sourceLimit) {
    731                     /* test the following code unit */
    732                     UChar trail=*source;
    733                     if(U16_IS_TRAIL(trail)) {
    734                         ++source;
    735                         c=U16_GET_SUPPLEMENTARY(c, trail);
    736                     }
    737                 } else {
    738                     /* no more input */
    739                     c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
    740                     break;
    741                 }
    742             }
    743 
    744             /*
    745              * all other Unicode code points c==U+0021..U+10ffff
    746              * are encoded with the difference c-prev
    747              *
    748              * a new prev is computed from c,
    749              * placed in the middle of a 0x80-block (for most small scripts) or
    750              * in the middle of the Unihan and Hangul blocks
    751              * to statistically minimize the following difference
    752              */
    753             diff=c-prev;
    754             prev=BOCU1_PREV(c);
    755             if(DIFF_IS_SINGLE(diff)) {
    756                 *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
    757                 --targetCapacity;
    758                 if(c<0x3000) {
    759                     goto fastSingle;
    760                 }
    761             } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
    762                 /* optimize 2-byte case */
    763                 int32_t m;
    764 
    765                 if(diff>=0) {
    766                     diff-=BOCU1_REACH_POS_1+1;
    767                     m=diff%BOCU1_TRAIL_COUNT;
    768                     diff/=BOCU1_TRAIL_COUNT;
    769                     diff+=BOCU1_START_POS_2;
    770                 } else {
    771                     diff-=BOCU1_REACH_NEG_1;
    772                     NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    773                     diff+=BOCU1_START_NEG_2;
    774                 }
    775                 *target++=(uint8_t)diff;
    776                 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
    777                 targetCapacity-=2;
    778             } else {
    779                 int32_t length; /* will be 2..4 */
    780 
    781                 diff=packDiff(diff);
    782                 length=BOCU1_LENGTH_FROM_PACKED(diff);
    783 
    784                 /* write the output character bytes from diff and length */
    785                 /* from the first if in the loop we know that targetCapacity>0 */
    786                 if(length<=targetCapacity) {
    787                     switch(length) {
    788                         /* each branch falls through to the next one */
    789                     case 4:
    790                         *target++=(uint8_t)(diff>>24);
    791                         U_FALLTHROUGH;
    792                     case 3:
    793                         *target++=(uint8_t)(diff>>16);
    794                     /* case 2: handled above */
    795                         *target++=(uint8_t)(diff>>8);
    796                     /* case 1: handled above */
    797                         *target++=(uint8_t)diff;
    798                         U_FALLTHROUGH;
    799                     default:
    800                         /* will never occur */
    801                         break;
    802                     }
    803                     targetCapacity-=length;
    804                 } else {
    805                     uint8_t *charErrorBuffer;
    806 
    807                     /*
    808                      * We actually do this backwards here:
    809                      * In order to save an intermediate variable, we output
    810                      * first to the overflow buffer what does not fit into the
    811                      * regular target.
    812                      */
    813                     /* we know that 1<=targetCapacity<length<=4 */
    814                     length-=targetCapacity;
    815                     charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
    816                     switch(length) {
    817                         /* each branch falls through to the next one */
    818                     case 3:
    819                         *charErrorBuffer++=(uint8_t)(diff>>16);
    820                         U_FALLTHROUGH;
    821                     case 2:
    822                         *charErrorBuffer++=(uint8_t)(diff>>8);
    823                         U_FALLTHROUGH;
    824                     case 1:
    825                         *charErrorBuffer=(uint8_t)diff;
    826                         U_FALLTHROUGH;
    827                     default:
    828                         /* will never occur */
    829                         break;
    830                     }
    831                     cnv->charErrorBufferLength=(int8_t)length;
    832 
    833                     /* now output what fits into the regular target */
    834                     diff>>=8*length; /* length was reduced by targetCapacity */
    835                     switch(targetCapacity) {
    836                         /* each branch falls through to the next one */
    837                     case 3:
    838                         *target++=(uint8_t)(diff>>16);
    839                         U_FALLTHROUGH;
    840                     case 2:
    841                         *target++=(uint8_t)(diff>>8);
    842                         U_FALLTHROUGH;
    843                     case 1:
    844                         *target++=(uint8_t)diff;
    845                         U_FALLTHROUGH;
    846                     default:
    847                         /* will never occur */
    848                         break;
    849                     }
    850 
    851                     /* target overflow */
    852                     targetCapacity=0;
    853                     *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    854                     break;
    855                 }
    856             }
    857         } else {
    858             /* target is full */
    859             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    860             break;
    861         }
    862     }
    863 
    864     /* set the converter state back into UConverter */
    865     cnv->fromUChar32= c<0 ? -c : 0;
    866     cnv->fromUnicodeStatus=(uint32_t)prev;
    867 
    868     /* write back the updated pointers */
    869     pArgs->source=source;
    870     pArgs->target=(char *)target;
    871 }
    872 
    873 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */
    874 
    875 /**
    876  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    877  *
    878  * @param b lead byte;
    879  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD
    880  * @return (diff<<2)|count
    881  */
    882 static inline int32_t
    883 decodeBocu1LeadByte(int32_t b) {
    884     int32_t diff, count;
    885 
    886     if(b>=BOCU1_START_NEG_2) {
    887         /* positive difference */
    888         if(b<BOCU1_START_POS_3) {
    889             /* two bytes */
    890             diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    891             count=1;
    892         } else if(b<BOCU1_START_POS_4) {
    893             /* three bytes */
    894             diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    895             count=2;
    896         } else {
    897             /* four bytes */
    898             diff=BOCU1_REACH_POS_3+1;
    899             count=3;
    900         }
    901     } else {
    902         /* negative difference */
    903         if(b>=BOCU1_START_NEG_3) {
    904             /* two bytes */
    905             diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    906             count=1;
    907         } else if(b>BOCU1_MIN) {
    908             /* three bytes */
    909             diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    910             count=2;
    911         } else {
    912             /* four bytes */
    913             diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    914             count=3;
    915         }
    916     }
    917 
    918     /* return the state for decoding the trail byte(s) */
    919     return (diff<<2)|count;
    920 }
    921 
    922 /**
    923  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    924  *
    925  * @param count number of remaining trail bytes including this one
    926  * @param b trail byte
    927  * @return new delta for diff including b - <0 indicates an error
    928  *
    929  * @see decodeBocu1
    930  */
    931 static inline int32_t
    932 decodeBocu1TrailByte(int32_t count, int32_t b) {
    933     if(b<=0x20) {
    934         /* skip some C0 controls and make the trail byte range contiguous */
    935         b=bocu1ByteToTrail[b];
    936         /* b<0 for an illegal trail byte value will result in return<0 below */
    937 #if BOCU1_MAX_TRAIL<0xff
    938     } else if(b>BOCU1_MAX_TRAIL) {
    939         return -99;
    940 #endif
    941     } else {
    942         b-=BOCU1_TRAIL_BYTE_OFFSET;
    943     }
    944 
    945     /* add trail byte into difference and decrement count */
    946     if(count==1) {
    947         return b;
    948     } else if(count==2) {
    949         return b*BOCU1_TRAIL_COUNT;
    950     } else /* count==3 */ {
    951         return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT);
    952     }
    953 }
    954 
    955 static void U_CALLCONV
    956 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs,
    957                            UErrorCode *pErrorCode) {
    958     UConverter *cnv;
    959     const uint8_t *source, *sourceLimit;
    960     UChar *target;
    961     const UChar *targetLimit;
    962     int32_t *offsets;
    963 
    964     int32_t prev, count, diff, c;
    965 
    966     int8_t byteIndex;
    967     uint8_t *bytes;
    968 
    969     int32_t sourceIndex, nextSourceIndex;
    970 
    971     /* set up the local pointers */
    972     cnv=pArgs->converter;
    973     source=(const uint8_t *)pArgs->source;
    974     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
    975     target=pArgs->target;
    976     targetLimit=pArgs->targetLimit;
    977     offsets=pArgs->offsets;
    978 
    979     /* get the converter state from UConverter */
    980     prev=(int32_t)cnv->toUnicodeStatus;
    981     if(prev==0) {
    982         prev=BOCU1_ASCII_PREV;
    983     }
    984     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
    985     count=diff&3;
    986     diff>>=2;
    987 
    988     byteIndex=cnv->toULength;
    989     bytes=cnv->toUBytes;
    990 
    991     /* sourceIndex=-1 if the current character began in the previous buffer */
    992     sourceIndex=byteIndex==0 ? 0 : -1;
    993     nextSourceIndex=0;
    994 
    995     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
    996     if(count>0 && byteIndex>0 && target<targetLimit) {
    997         goto getTrail;
    998     }
    999 
   1000 fastSingle:
   1001     /* fast loop for single-byte differences */
   1002     /* use count as the only loop counter variable */
   1003     diff=(int32_t)(sourceLimit-source);
   1004     count=(int32_t)(pArgs->targetLimit-target);
   1005     if(count>diff) {
   1006         count=diff;
   1007     }
   1008     while(count>0) {
   1009         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1010             c=prev+(c-BOCU1_MIDDLE);
   1011             if(c<0x3000) {
   1012                 *target++=(UChar)c;
   1013                 *offsets++=nextSourceIndex++;
   1014                 prev=BOCU1_SIMPLE_PREV(c);
   1015             } else {
   1016                 break;
   1017             }
   1018         } else if(c<=0x20) {
   1019             if(c!=0x20) {
   1020                 prev=BOCU1_ASCII_PREV;
   1021             }
   1022             *target++=(UChar)c;
   1023             *offsets++=nextSourceIndex++;
   1024         } else {
   1025             break;
   1026         }
   1027         ++source;
   1028         --count;
   1029     }
   1030     sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */
   1031 
   1032     /* decode a sequence of single and lead bytes */
   1033     while(source<sourceLimit) {
   1034         if(target>=targetLimit) {
   1035             /* target is full */
   1036             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1037             break;
   1038         }
   1039 
   1040         ++nextSourceIndex;
   1041         c=*source++;
   1042         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1043             /* Write a code point directly from a single-byte difference. */
   1044             c=prev+(c-BOCU1_MIDDLE);
   1045             if(c<0x3000) {
   1046                 *target++=(UChar)c;
   1047                 *offsets++=sourceIndex;
   1048                 prev=BOCU1_SIMPLE_PREV(c);
   1049                 sourceIndex=nextSourceIndex;
   1050                 goto fastSingle;
   1051             }
   1052         } else if(c<=0x20) {
   1053             /*
   1054              * Direct-encoded C0 control code or space.
   1055              * Reset prev for C0 control codes but not for space.
   1056              */
   1057             if(c!=0x20) {
   1058                 prev=BOCU1_ASCII_PREV;
   1059             }
   1060             *target++=(UChar)c;
   1061             *offsets++=sourceIndex;
   1062             sourceIndex=nextSourceIndex;
   1063             continue;
   1064         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1065             /* Optimize two-byte case. */
   1066             if(c>=BOCU1_MIDDLE) {
   1067                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1068             } else {
   1069                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1070             }
   1071 
   1072             /* trail byte */
   1073             ++nextSourceIndex;
   1074             c=decodeBocu1TrailByte(1, *source++);
   1075             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1076                 bytes[0]=source[-2];
   1077                 bytes[1]=source[-1];
   1078                 byteIndex=2;
   1079                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1080                 break;
   1081             }
   1082         } else if(c==BOCU1_RESET) {
   1083             /* only reset the state, no code point */
   1084             prev=BOCU1_ASCII_PREV;
   1085             sourceIndex=nextSourceIndex;
   1086             continue;
   1087         } else {
   1088             /*
   1089              * For multi-byte difference lead bytes, set the decoder state
   1090              * with the partial difference value from the lead byte and
   1091              * with the number of trail bytes.
   1092              */
   1093             bytes[0]=(uint8_t)c;
   1094             byteIndex=1;
   1095 
   1096             diff=decodeBocu1LeadByte(c);
   1097             count=diff&3;
   1098             diff>>=2;
   1099 getTrail:
   1100             for(;;) {
   1101                 if(source>=sourceLimit) {
   1102                     goto endloop;
   1103                 }
   1104                 ++nextSourceIndex;
   1105                 c=bytes[byteIndex++]=*source++;
   1106 
   1107                 /* trail byte in any position */
   1108                 c=decodeBocu1TrailByte(count, c);
   1109                 if(c<0) {
   1110                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1111                     goto endloop;
   1112                 }
   1113 
   1114                 diff+=c;
   1115                 if(--count==0) {
   1116                     /* final trail byte, deliver a code point */
   1117                     byteIndex=0;
   1118                     c=prev+diff;
   1119                     if((uint32_t)c>0x10ffff) {
   1120                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1121                         goto endloop;
   1122                     }
   1123                     break;
   1124                 }
   1125             }
   1126         }
   1127 
   1128         /* calculate the next prev and output c */
   1129         prev=BOCU1_PREV(c);
   1130         if(c<=0xffff) {
   1131             *target++=(UChar)c;
   1132             *offsets++=sourceIndex;
   1133         } else {
   1134             /* output surrogate pair */
   1135             *target++=U16_LEAD(c);
   1136             if(target<targetLimit) {
   1137                 *target++=U16_TRAIL(c);
   1138                 *offsets++=sourceIndex;
   1139                 *offsets++=sourceIndex;
   1140             } else {
   1141                 /* target overflow */
   1142                 *offsets++=sourceIndex;
   1143                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1144                 cnv->UCharErrorBufferLength=1;
   1145                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1146                 break;
   1147             }
   1148         }
   1149         sourceIndex=nextSourceIndex;
   1150     }
   1151 endloop:
   1152 
   1153     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1154         /* set the converter state in UConverter to deal with the next character */
   1155         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1156         cnv->mode=0;
   1157     } else {
   1158         /* set the converter state back into UConverter */
   1159         cnv->toUnicodeStatus=(uint32_t)prev;
   1160         cnv->mode=(diff<<2)|count;
   1161     }
   1162     cnv->toULength=byteIndex;
   1163 
   1164     /* write back the updated pointers */
   1165     pArgs->source=(const char *)source;
   1166     pArgs->target=target;
   1167     pArgs->offsets=offsets;
   1168     return;
   1169 }
   1170 
   1171 /*
   1172  * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling.
   1173  * If a change is made in the original function, then either
   1174  * change this function the same way or
   1175  * re-copy the original function and remove the variables
   1176  * offsets, sourceIndex, and nextSourceIndex.
   1177  */
   1178 static void U_CALLCONV
   1179 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs,
   1180                 UErrorCode *pErrorCode) {
   1181     UConverter *cnv;
   1182     const uint8_t *source, *sourceLimit;
   1183     UChar *target;
   1184     const UChar *targetLimit;
   1185 
   1186     int32_t prev, count, diff, c;
   1187 
   1188     int8_t byteIndex;
   1189     uint8_t *bytes;
   1190 
   1191     /* set up the local pointers */
   1192     cnv=pArgs->converter;
   1193     source=(const uint8_t *)pArgs->source;
   1194     sourceLimit=(const uint8_t *)pArgs->sourceLimit;
   1195     target=pArgs->target;
   1196     targetLimit=pArgs->targetLimit;
   1197 
   1198     /* get the converter state from UConverter */
   1199     prev=(int32_t)cnv->toUnicodeStatus;
   1200     if(prev==0) {
   1201         prev=BOCU1_ASCII_PREV;
   1202     }
   1203     diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */
   1204     count=diff&3;
   1205     diff>>=2;
   1206 
   1207     byteIndex=cnv->toULength;
   1208     bytes=cnv->toUBytes;
   1209 
   1210     /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */
   1211     if(count>0 && byteIndex>0 && target<targetLimit) {
   1212         goto getTrail;
   1213     }
   1214 
   1215 fastSingle:
   1216     /* fast loop for single-byte differences */
   1217     /* use count as the only loop counter variable */
   1218     diff=(int32_t)(sourceLimit-source);
   1219     count=(int32_t)(pArgs->targetLimit-target);
   1220     if(count>diff) {
   1221         count=diff;
   1222     }
   1223     while(count>0) {
   1224         if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) {
   1225             c=prev+(c-BOCU1_MIDDLE);
   1226             if(c<0x3000) {
   1227                 *target++=(UChar)c;
   1228                 prev=BOCU1_SIMPLE_PREV(c);
   1229             } else {
   1230                 break;
   1231             }
   1232         } else if(c<=0x20) {
   1233             if(c!=0x20) {
   1234                 prev=BOCU1_ASCII_PREV;
   1235             }
   1236             *target++=(UChar)c;
   1237         } else {
   1238             break;
   1239         }
   1240         ++source;
   1241         --count;
   1242     }
   1243 
   1244     /* decode a sequence of single and lead bytes */
   1245     while(source<sourceLimit) {
   1246         if(target>=targetLimit) {
   1247             /* target is full */
   1248             *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1249             break;
   1250         }
   1251 
   1252         c=*source++;
   1253         if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) {
   1254             /* Write a code point directly from a single-byte difference. */
   1255             c=prev+(c-BOCU1_MIDDLE);
   1256             if(c<0x3000) {
   1257                 *target++=(UChar)c;
   1258                 prev=BOCU1_SIMPLE_PREV(c);
   1259                 goto fastSingle;
   1260             }
   1261         } else if(c<=0x20) {
   1262             /*
   1263              * Direct-encoded C0 control code or space.
   1264              * Reset prev for C0 control codes but not for space.
   1265              */
   1266             if(c!=0x20) {
   1267                 prev=BOCU1_ASCII_PREV;
   1268             }
   1269             *target++=(UChar)c;
   1270             continue;
   1271         } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) {
   1272             /* Optimize two-byte case. */
   1273             if(c>=BOCU1_MIDDLE) {
   1274                 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
   1275             } else {
   1276                 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
   1277             }
   1278 
   1279             /* trail byte */
   1280             c=decodeBocu1TrailByte(1, *source++);
   1281             if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) {
   1282                 bytes[0]=source[-2];
   1283                 bytes[1]=source[-1];
   1284                 byteIndex=2;
   1285                 *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1286                 break;
   1287             }
   1288         } else if(c==BOCU1_RESET) {
   1289             /* only reset the state, no code point */
   1290             prev=BOCU1_ASCII_PREV;
   1291             continue;
   1292         } else {
   1293             /*
   1294              * For multi-byte difference lead bytes, set the decoder state
   1295              * with the partial difference value from the lead byte and
   1296              * with the number of trail bytes.
   1297              */
   1298             bytes[0]=(uint8_t)c;
   1299             byteIndex=1;
   1300 
   1301             diff=decodeBocu1LeadByte(c);
   1302             count=diff&3;
   1303             diff>>=2;
   1304 getTrail:
   1305             for(;;) {
   1306                 if(source>=sourceLimit) {
   1307                     goto endloop;
   1308                 }
   1309                 c=bytes[byteIndex++]=*source++;
   1310 
   1311                 /* trail byte in any position */
   1312                 c=decodeBocu1TrailByte(count, c);
   1313                 if(c<0) {
   1314                     *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1315                     goto endloop;
   1316                 }
   1317 
   1318                 diff+=c;
   1319                 if(--count==0) {
   1320                     /* final trail byte, deliver a code point */
   1321                     byteIndex=0;
   1322                     c=prev+diff;
   1323                     if((uint32_t)c>0x10ffff) {
   1324                         *pErrorCode=U_ILLEGAL_CHAR_FOUND;
   1325                         goto endloop;
   1326                     }
   1327                     break;
   1328                 }
   1329             }
   1330         }
   1331 
   1332         /* calculate the next prev and output c */
   1333         prev=BOCU1_PREV(c);
   1334         if(c<=0xffff) {
   1335             *target++=(UChar)c;
   1336         } else {
   1337             /* output surrogate pair */
   1338             *target++=U16_LEAD(c);
   1339             if(target<targetLimit) {
   1340                 *target++=U16_TRAIL(c);
   1341             } else {
   1342                 /* target overflow */
   1343                 cnv->UCharErrorBuffer[0]=U16_TRAIL(c);
   1344                 cnv->UCharErrorBufferLength=1;
   1345                 *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
   1346                 break;
   1347             }
   1348         }
   1349     }
   1350 endloop:
   1351 
   1352     if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) {
   1353         /* set the converter state in UConverter to deal with the next character */
   1354         cnv->toUnicodeStatus=BOCU1_ASCII_PREV;
   1355         cnv->mode=0;
   1356     } else {
   1357         /* set the converter state back into UConverter */
   1358         cnv->toUnicodeStatus=(uint32_t)prev;
   1359         cnv->mode=(diff<<2)|count;
   1360     }
   1361     cnv->toULength=byteIndex;
   1362 
   1363     /* write back the updated pointers */
   1364     pArgs->source=(const char *)source;
   1365     pArgs->target=target;
   1366     return;
   1367 }
   1368 
   1369 /* miscellaneous ------------------------------------------------------------ */
   1370 
   1371 static const UConverterImpl _Bocu1Impl={
   1372     UCNV_BOCU1,
   1373 
   1374     NULL,
   1375     NULL,
   1376 
   1377     NULL,
   1378     NULL,
   1379     NULL,
   1380 
   1381     _Bocu1ToUnicode,
   1382     _Bocu1ToUnicodeWithOffsets,
   1383     _Bocu1FromUnicode,
   1384     _Bocu1FromUnicodeWithOffsets,
   1385     NULL,
   1386 
   1387     NULL,
   1388     NULL,
   1389     NULL,
   1390     NULL,
   1391     ucnv_getCompleteUnicodeSet,
   1392 
   1393     NULL,
   1394     NULL
   1395 };
   1396 
   1397 static const UConverterStaticData _Bocu1StaticData={
   1398     sizeof(UConverterStaticData),
   1399     "BOCU-1",
   1400     1214, /* CCSID for BOCU-1 */
   1401     UCNV_IBM, UCNV_BOCU1,
   1402     1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */
   1403     { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */
   1404     FALSE, FALSE,
   1405     0,
   1406     0,
   1407     { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */
   1408 };
   1409 
   1410 const UConverterSharedData _Bocu1Data=
   1411         UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl);
   1412 
   1413 #endif
   1414