Home | History | Annotate | Download | only in cintltst
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  bocu1tst.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002may27
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is the reference implementation of BOCU-1,
     17 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
     18 *   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
     19 *   The files bocu1.h and bocu1.c from the design folder are taken
     20 *   verbatim (minus copyright and #include) and copied together into this file.
     21 *   The reference code and some of the reference bocu1tst.c
     22 *   are modified to run as part of the ICU cintltst
     23 *   test framework (minus main(), log_ln() etc. instead of printf()).
     24 *
     25 *   This reference implementation is used here to verify
     26 *   the ICU BOCU-1 implementation, which is
     27 *   adapted for ICU conversion APIs and optimized.
     28 *   ### links in design doc to here and to ucnvbocu.c
     29 */
     30 
     31 #include "unicode/utypes.h"
     32 #include "unicode/ustring.h"
     33 #include "unicode/ucnv.h"
     34 #include "unicode/utf16.h"
     35 #include "cmemory.h"
     36 #include "cintltst.h"
     37 
     38 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))
     39 
     40 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
     41 
     42 /* BOCU-1 constants and macros ---------------------------------------------- */
     43 
     44 /*
     45  * BOCU-1 encodes the code points of a Unicode string as
     46  * a sequence of byte-encoded differences (slope detection),
     47  * preserving lexical order.
     48  *
     49  * Optimize the difference-taking for runs of Unicode text within
     50  * small scripts:
     51  *
     52  * Most small scripts are allocated within aligned 128-blocks of Unicode
     53  * code points. Lexical order is preserved if the "previous code point" state
     54  * is always moved into the middle of such a block.
     55  *
     56  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     57  * areas into the middle of those areas.
     58  *
     59  * C0 control codes and space are encoded with their US-ASCII bytes.
     60  * "prev" is reset for C0 controls but not for space.
     61  */
     62 
     63 /* initial value for "prev": middle of the ASCII range */
     64 #define BOCU1_ASCII_PREV        0x40
     65 
     66 /* bounding byte values for differences */
     67 #define BOCU1_MIN               0x21
     68 #define BOCU1_MIDDLE            0x90
     69 #define BOCU1_MAX_LEAD          0xfe
     70 
     71 /* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
     72 #define BOCU1_MAX_TRAIL         0xffL
     73 #define BOCU1_RESET             0xff
     74 
     75 /* number of lead bytes */
     76 #define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)
     77 
     78 /* adjust trail byte counts for the use of some C0 control byte values */
     79 #define BOCU1_TRAIL_CONTROLS_COUNT  20
     80 #define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)
     81 
     82 /* number of trail bytes */
     83 #define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)
     84 
     85 /*
     86  * number of positive and negative single-byte codes
     87  * (counting 0==BOCU1_MIDDLE among the positive ones)
     88  */
     89 #define BOCU1_SINGLE            64
     90 
     91 /* number of lead bytes for positive and negative 2/3/4-byte sequences */
     92 #define BOCU1_LEAD_2            43
     93 #define BOCU1_LEAD_3            3
     94 #define BOCU1_LEAD_4            1
     95 
     96 /* The difference value range for single-byters. */
     97 #define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
     98 #define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)
     99 
    100 /* The difference value range for double-byters. */
    101 #define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    102 #define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
    103 
    104 /* The difference value range for 3-byters. */
    105 #define BOCU1_REACH_POS_3   \
    106     (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
    107 
    108 #define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)
    109 
    110 /* The lead byte start values. */
    111 #define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
    112 #define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
    113 #define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
    114      /* ==BOCU1_MAX_LEAD */
    115 
    116 #define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
    117 #define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
    118 #define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
    119      /* ==BOCU1_MIN+1 */
    120 
    121 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
    122 #define BOCU1_LENGTH_FROM_LEAD(lead) \
    123     ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
    124      (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
    125      (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)
    126 
    127 /* The length of a byte sequence, according to its packed form. */
    128 #define BOCU1_LENGTH_FROM_PACKED(packed) \
    129     ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    130 
    131 /*
    132  * 12 commonly used C0 control codes (and space) are only used to encode
    133  * themselves directly,
    134  * which makes BOCU-1 MIME-usable and reasonably safe for
    135  * ASCII-oriented software.
    136  *
    137  * These controls are
    138  *  0   NUL
    139  *
    140  *  7   BEL
    141  *  8   BS
    142  *
    143  *  9   TAB
    144  *  a   LF
    145  *  b   VT
    146  *  c   FF
    147  *  d   CR
    148  *
    149  *  e   SO
    150  *  f   SI
    151  *
    152  * 1a   SUB
    153  * 1b   ESC
    154  *
    155  * The other 20 C0 controls are also encoded directly (to preserve order)
    156  * but are also used as trail bytes in difference encoding
    157  * (for better compression).
    158  */
    159 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    160 
    161 /*
    162  * Byte value map for control codes,
    163  * from external byte values 0x00..0x20
    164  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    165  * External byte values that are illegal as trail bytes are mapped to -1.
    166  */
    167 static const int8_t
    168 bocu1ByteToTrail[BOCU1_MIN]={
    169 /*  0     1     2     3     4     5     6     7    */
    170     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    171 
    172 /*  8     9     a     b     c     d     e     f    */
    173     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    174 
    175 /*  10    11    12    13    14    15    16    17   */
    176     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    177 
    178 /*  18    19    1a    1b    1c    1d    1e    1f   */
    179     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    180 
    181 /*  20   */
    182     -1
    183 };
    184 
    185 /*
    186  * Byte value map for control codes,
    187  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    188  * to external byte values 0x00..0x20.
    189  */
    190 static const int8_t
    191 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    192 /*  0     1     2     3     4     5     6     7    */
    193     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    194 
    195 /*  8     9     a     b     c     d     e     f    */
    196     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    197 
    198 /*  10    11    12    13   */
    199     0x1c, 0x1d, 0x1e, 0x1f
    200 };
    201 
    202 /**
    203  * Integer division and modulo with negative numerators
    204  * yields negative modulo results and quotients that are one more than
    205  * what we need here.
    206  * This macro adjust the results so that the modulo-value m is always >=0.
    207  *
    208  * For positive n, the if() condition is always FALSE.
    209  *
    210  * @param n Number to be split into quotient and rest.
    211  *          Will be modified to contain the quotient.
    212  * @param d Divisor.
    213  * @param m Output variable for the rest (modulo result).
    214  */
    215 #define NEGDIVMOD(n, d, m) { \
    216     (m)=(n)%(d); \
    217     (n)/=(d); \
    218     if((m)<0) { \
    219         --(n); \
    220         (m)+=(d); \
    221     } \
    222 }
    223 
    224 /* State for BOCU-1 decoder function. */
    225 struct Bocu1Rx {
    226     int32_t prev, count, diff;
    227 };
    228 
    229 typedef struct Bocu1Rx Bocu1Rx;
    230 
    231 /* Function prototypes ------------------------------------------------------ */
    232 
    233 /* see bocu1.c */
    234 U_CFUNC int32_t
    235 packDiff(int32_t diff);
    236 
    237 U_CFUNC int32_t
    238 encodeBocu1(int32_t *pPrev, int32_t c);
    239 
    240 U_CFUNC int32_t
    241 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
    242 
    243 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
    244 
    245 /* BOCU-1 implementation functions ------------------------------------------ */
    246 
    247 /**
    248  * Compute the next "previous" value for differencing
    249  * from the current code point.
    250  *
    251  * @param c current code point, 0..0x10ffff
    252  * @return "previous code point" state value
    253  */
    254 static int32_t
    255 bocu1Prev(int32_t c) {
    256     /* compute new prev */
    257     if(0x3040<=c && c<=0x309f) {
    258         /* Hiragana is not 128-aligned */
    259         return 0x3070;
    260     } else if(0x4e00<=c && c<=0x9fa5) {
    261         /* CJK Unihan */
    262         return 0x4e00-BOCU1_REACH_NEG_2;
    263     } else if(0xac00<=c && c<=0xd7a3) {
    264         /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
    265         return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
    266     } else {
    267         /* mostly small scripts */
    268         return (c&~0x7f)+BOCU1_ASCII_PREV;
    269     }
    270 }
    271 
    272 /**
    273  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    274  * and return a packed integer with them.
    275  *
    276  * The encoding favors small absolut differences with short encodings
    277  * to compress runs of same-script characters.
    278  *
    279  * @param diff difference value -0x10ffff..0x10ffff
    280  * @return
    281  *      0x010000zz for 1-byte sequence zz
    282  *      0x0200yyzz for 2-byte sequence yy zz
    283  *      0x03xxyyzz for 3-byte sequence xx yy zz
    284  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    285  */
    286 U_CFUNC int32_t
    287 packDiff(int32_t diff) {
    288     int32_t result, m, lead, count, shift;
    289 
    290     if(diff>=BOCU1_REACH_NEG_1) {
    291         /* mostly positive differences, and single-byte negative ones */
    292         if(diff<=BOCU1_REACH_POS_1) {
    293             /* single byte */
    294             return 0x01000000|(BOCU1_MIDDLE+diff);
    295         } else if(diff<=BOCU1_REACH_POS_2) {
    296             /* two bytes */
    297             diff-=BOCU1_REACH_POS_1+1;
    298             lead=BOCU1_START_POS_2;
    299             count=1;
    300         } else if(diff<=BOCU1_REACH_POS_3) {
    301             /* three bytes */
    302             diff-=BOCU1_REACH_POS_2+1;
    303             lead=BOCU1_START_POS_3;
    304             count=2;
    305         } else {
    306             /* four bytes */
    307             diff-=BOCU1_REACH_POS_3+1;
    308             lead=BOCU1_START_POS_4;
    309             count=3;
    310         }
    311     } else {
    312         /* two- and four-byte negative differences */
    313         if(diff>=BOCU1_REACH_NEG_2) {
    314             /* two bytes */
    315             diff-=BOCU1_REACH_NEG_1;
    316             lead=BOCU1_START_NEG_2;
    317             count=1;
    318         } else if(diff>=BOCU1_REACH_NEG_3) {
    319             /* three bytes */
    320             diff-=BOCU1_REACH_NEG_2;
    321             lead=BOCU1_START_NEG_3;
    322             count=2;
    323         } else {
    324             /* four bytes */
    325             diff-=BOCU1_REACH_NEG_3;
    326             lead=BOCU1_START_NEG_4;
    327             count=3;
    328         }
    329     }
    330 
    331     /* encode the length of the packed result */
    332     if(count<3) {
    333         result=(count+1)<<24;
    334     } else /* count==3, MSB used for the lead byte */ {
    335         result=0;
    336     }
    337 
    338     /* calculate trail bytes like digits in itoa() */
    339     shift=0;
    340     do {
    341         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    342         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
    343         shift+=8;
    344     } while(--count>0);
    345 
    346     /* add lead byte */
    347     result|=(lead+diff)<<shift;
    348 
    349     return result;
    350 }
    351 
    352 /**
    353  * BOCU-1 encoder function.
    354  *
    355  * @param pPrev pointer to the integer that holds
    356  *        the "previous code point" state;
    357  *        the initial value should be 0 which
    358  *        encodeBocu1 will set to the actual BOCU-1 initial state value
    359  * @param c the code point to encode
    360  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
    361  *         or 0 if an error occurs
    362  *
    363  * @see packDiff
    364  */
    365 U_CFUNC int32_t
    366 encodeBocu1(int32_t *pPrev, int32_t c) {
    367     int32_t prev;
    368 
    369     if(pPrev==NULL || c<0 || c>0x10ffff) {
    370         /* illegal argument */
    371         return 0;
    372     }
    373 
    374     prev=*pPrev;
    375     if(prev==0) {
    376         /* lenient handling of initial value 0 */
    377         prev=*pPrev=BOCU1_ASCII_PREV;
    378     }
    379 
    380     if(c<=0x20) {
    381         /*
    382          * ISO C0 control & space:
    383          * Encode directly for MIME compatibility,
    384          * and reset state except for space, to not disrupt compression.
    385          */
    386         if(c!=0x20) {
    387             *pPrev=BOCU1_ASCII_PREV;
    388         }
    389         return 0x01000000|c;
    390     }
    391 
    392     /*
    393      * all other Unicode code points c==U+0021..U+10ffff
    394      * are encoded with the difference c-prev
    395      *
    396      * a new prev is computed from c,
    397      * placed in the middle of a 0x80-block (for most small scripts) or
    398      * in the middle of the Unihan and Hangul blocks
    399      * to statistically minimize the following difference
    400      */
    401     *pPrev=bocu1Prev(c);
    402     return packDiff(c-prev);
    403 }
    404 
    405 /**
    406  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    407  *
    408  * @param pRx pointer to the decoder state structure
    409  * @param b lead byte;
    410  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
    411  * @return -1 (state change only)
    412  *
    413  * @see decodeBocu1
    414  */
    415 static int32_t
    416 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
    417     int32_t c, count;
    418 
    419     if(b>=BOCU1_START_NEG_2) {
    420         /* positive difference */
    421         if(b<BOCU1_START_POS_3) {
    422             /* two bytes */
    423             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    424             count=1;
    425         } else if(b<BOCU1_START_POS_4) {
    426             /* three bytes */
    427             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    428             count=2;
    429         } else {
    430             /* four bytes */
    431             c=BOCU1_REACH_POS_3+1;
    432             count=3;
    433         }
    434     } else {
    435         /* negative difference */
    436         if(b>=BOCU1_START_NEG_3) {
    437             /* two bytes */
    438             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    439             count=1;
    440         } else if(b>BOCU1_MIN) {
    441             /* three bytes */
    442             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    443             count=2;
    444         } else {
    445             /* four bytes */
    446             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    447             count=3;
    448         }
    449     }
    450 
    451     /* set the state for decoding the trail byte(s) */
    452     pRx->diff=c;
    453     pRx->count=count;
    454     return -1;
    455 }
    456 
    457 /**
    458  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    459  *
    460  * @param pRx pointer to the decoder state structure
    461  * @param b trail byte
    462  * @return result value, same as decodeBocu1
    463  *
    464  * @see decodeBocu1
    465  */
    466 static int32_t
    467 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
    468     int32_t t, c, count;
    469 
    470     if(b<=0x20) {
    471         /* skip some C0 controls and make the trail byte range contiguous */
    472         t=bocu1ByteToTrail[b];
    473         if(t<0) {
    474             /* illegal trail byte value */
    475             pRx->prev=BOCU1_ASCII_PREV;
    476             pRx->count=0;
    477             return -99;
    478         }
    479 #if BOCU1_MAX_TRAIL<0xff
    480     } else if(b>BOCU1_MAX_TRAIL) {
    481         return -99;
    482 #endif
    483     } else {
    484         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
    485     }
    486 
    487     /* add trail byte into difference and decrement count */
    488     c=pRx->diff;
    489     count=pRx->count;
    490 
    491     if(count==1) {
    492         /* final trail byte, deliver a code point */
    493         c=pRx->prev+c+t;
    494         if(0<=c && c<=0x10ffff) {
    495             /* valid code point result */
    496             pRx->prev=bocu1Prev(c);
    497             pRx->count=0;
    498             return c;
    499         } else {
    500             /* illegal code point result */
    501             pRx->prev=BOCU1_ASCII_PREV;
    502             pRx->count=0;
    503             return -99;
    504         }
    505     }
    506 
    507     /* intermediate trail byte */
    508     if(count==2) {
    509         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
    510     } else /* count==3 */ {
    511         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
    512     }
    513     pRx->count=count-1;
    514     return -1;
    515 }
    516 
    517 /**
    518  * BOCU-1 decoder function.
    519  *
    520  * @param pRx pointer to the decoder state structure;
    521  *        the initial values should be 0 which
    522  *        decodeBocu1 will set to actual initial state values
    523  * @param b an input byte
    524  * @return
    525  *      0..0x10ffff for a result code point
    526  *      -1 if only the state changed without code point output
    527  *     <-1 if an error occurs
    528  */
    529 U_CFUNC int32_t
    530 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
    531     int32_t prev, c, count;
    532 
    533     if(pRx==NULL) {
    534         /* illegal argument */
    535         return -99;
    536     }
    537 
    538     prev=pRx->prev;
    539     if(prev==0) {
    540         /* lenient handling of initial 0 values */
    541         prev=pRx->prev=BOCU1_ASCII_PREV;
    542         count=pRx->count=0;
    543     } else {
    544         count=pRx->count;
    545     }
    546 
    547     if(count==0) {
    548         /* byte in lead position */
    549         if(b<=0x20) {
    550             /*
    551              * Direct-encoded C0 control code or space.
    552              * Reset prev for C0 control codes but not for space.
    553              */
    554             if(b!=0x20) {
    555                 pRx->prev=BOCU1_ASCII_PREV;
    556             }
    557             return b;
    558         }
    559 
    560         /*
    561          * b is a difference lead byte.
    562          *
    563          * Return a code point directly from a single-byte difference.
    564          *
    565          * For multi-byte difference lead bytes, set the decoder state
    566          * with the partial difference value from the lead byte and
    567          * with the number of trail bytes.
    568          *
    569          * For four-byte differences, the signedness also affects the
    570          * first trail byte, which has special handling farther below.
    571          */
    572         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
    573             /* single-byte difference */
    574             c=prev+((int32_t)b-BOCU1_MIDDLE);
    575             pRx->prev=bocu1Prev(c);
    576             return c;
    577         } else if(b==BOCU1_RESET) {
    578             /* only reset the state, no code point */
    579             pRx->prev=BOCU1_ASCII_PREV;
    580             return -1;
    581         } else {
    582             return decodeBocu1LeadByte(pRx, b);
    583         }
    584     } else {
    585         /* trail byte in any position */
    586         return decodeBocu1TrailByte(pRx, b);
    587     }
    588 }
    589 
    590 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
    591 
    592 /* test code ---------------------------------------------------------------- */
    593 
    594 /* test code options */
    595 
    596 /* ignore comma when processing name lists in testText() */
    597 #define TEST_IGNORE_COMMA       1
    598 
    599 /**
    600  * Write a packed BOCU-1 byte sequence into a byte array,
    601  * without overflow check.
    602  * Test function.
    603  *
    604  * @param packed packed BOCU-1 byte sequence, see packDiff()
    605  * @param p pointer to byte array
    606  * @return number of bytes
    607  *
    608  * @see packDiff
    609  */
    610 static int32_t
    611 writePacked(int32_t packed, uint8_t *p) {
    612     int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    613     switch(count) {
    614     case 4:
    615         *p++=(uint8_t)(packed>>24);
    616     case 3:
    617         *p++=(uint8_t)(packed>>16);
    618     case 2:
    619         *p++=(uint8_t)(packed>>8);
    620     case 1:
    621         *p++=(uint8_t)packed;
    622     default:
    623         break;
    624     }
    625 
    626     return count;
    627 }
    628 
    629 /**
    630  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
    631  * the difference to initialPrev.
    632  * Used only for round-trip testing of the difference encoding and decoding.
    633  * Test function.
    634  *
    635  * @param initialPrev bogus "previous code point" value to make sure that
    636  *                    the resulting code point is in the range 0..0x10ffff
    637  * @param packed packed BOCU-1 byte sequence
    638  * @return the difference to initialPrev
    639  *
    640  * @see packDiff
    641  * @see writeDiff
    642  */
    643 static int32_t
    644 unpackDiff(int32_t initialPrev, int32_t packed) {
    645     Bocu1Rx rx={ 0, 0, 0 };
    646     int32_t count;
    647 
    648     rx.prev=initialPrev;
    649     count=BOCU1_LENGTH_FROM_PACKED(packed);
    650     switch(count) {
    651     case 4:
    652         decodeBocu1(&rx, (uint8_t)(packed>>24));
    653     case 3:
    654         decodeBocu1(&rx, (uint8_t)(packed>>16));
    655     case 2:
    656         decodeBocu1(&rx, (uint8_t)(packed>>8));
    657     case 1:
    658         /* subtract initial prev */
    659         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
    660     default:
    661         return -0x7fffffff;
    662     }
    663 }
    664 
    665 /**
    666  * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
    667  * preserving lexical order.
    668  * Also checks for roundtripping of the difference encoding.
    669  * Test function.
    670  *
    671  * @param diff difference value to test, -0x10ffff..0x10ffff
    672  * @param p pointer to output byte array
    673  * @return p advanced by number of bytes output
    674  *
    675  * @see unpackDiff
    676  */
    677 static uint8_t *
    678 writeDiff(int32_t diff, uint8_t *p) {
    679     /* generate the difference as a packed value and serialize it */
    680     int32_t packed, initialPrev;
    681 
    682     packed=packDiff(diff);
    683 
    684     /*
    685      * bogus initial "prev" to work around
    686      * code point range check in decodeBocu1()
    687      */
    688     if(diff<=0) {
    689         initialPrev=0x10ffff;
    690     } else {
    691         initialPrev=-1;
    692     }
    693 
    694     if(diff!=unpackDiff(initialPrev, packed)) {
    695         log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
    696                 diff, packed, unpackDiff(initialPrev, packed));
    697     }
    698     return p+writePacked(packed, p);
    699 }
    700 
    701 /**
    702  * Encode a UTF-16 string in BOCU-1.
    703  * Does not check for overflows, but otherwise useful function.
    704  *
    705  * @param s input UTF-16 string
    706  * @param length number of UChar code units in s
    707  * @param p pointer to output byte array
    708  * @return number of bytes output
    709  */
    710 static int32_t
    711 writeString(const UChar *s, int32_t length, uint8_t *p) {
    712     uint8_t *p0;
    713     int32_t c, prev, i;
    714 
    715     prev=0;
    716     p0=p;
    717     i=0;
    718     while(i<length) {
    719         U16_NEXT(s, i, length, c);
    720         p+=writePacked(encodeBocu1(&prev, c), p);
    721     }
    722     return (int32_t)(p-p0);
    723 }
    724 
    725 /**
    726  * Decode a BOCU-1 byte sequence to a UTF-16 string.
    727  * Does not check for overflows, but otherwise useful function.
    728  *
    729  * @param p pointer to input BOCU-1 bytes
    730  * @param length number of input bytes
    731  * @param s point to output UTF-16 string array
    732  * @return number of UChar code units output
    733  */
    734 static int32_t
    735 readString(const uint8_t *p, int32_t length, UChar *s) {
    736     Bocu1Rx rx={ 0, 0, 0 };
    737     int32_t c, i, sLength;
    738 
    739     i=sLength=0;
    740     while(i<length) {
    741         c=decodeBocu1(&rx, p[i++]);
    742         if(c<-1) {
    743             log_err("error: readString detects encoding error at string index %ld\n", i);
    744             return -1;
    745         }
    746         if(c>=0) {
    747             U16_APPEND_UNSAFE(s, sLength, c);
    748         }
    749     }
    750     return sLength;
    751 }
    752 
    753 static char
    754 hexDigit(uint8_t digit) {
    755     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
    756 }
    757 
    758 /**
    759  * Pretty-print 0-terminated byte values.
    760  * Helper function for test output.
    761  *
    762  * @param bytes 0-terminated byte array to print
    763  */
    764 static void
    765 printBytes(uint8_t *bytes, char *out) {
    766     int i;
    767     uint8_t b;
    768 
    769     i=0;
    770     while((b=*bytes++)!=0) {
    771         *out++=' ';
    772         *out++=hexDigit((uint8_t)(b>>4));
    773         *out++=hexDigit((uint8_t)(b&0xf));
    774         ++i;
    775     }
    776     i=3*(5-i);
    777     while(i>0) {
    778         *out++=' ';
    779         --i;
    780     }
    781     *out=0;
    782 }
    783 
    784 /**
    785  * Basic BOCU-1 test function, called when there are no command line arguments.
    786  * Prints some of the #define values and performs round-trip tests of the
    787  * difference encoding and decoding.
    788  */
    789 static void
    790 TestBOCU1RefDiff(void) {
    791     char buf1[80], buf2[80];
    792     uint8_t prev[5], level[5];
    793     int32_t i, cmp, countErrors;
    794 
    795     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
    796     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
    797     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
    798 
    799     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
    800     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
    801     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
    802 
    803     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
    804     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
    805     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
    806 
    807     /* test packDiff() & unpackDiff() with some specific values */
    808     writeDiff(0, level);
    809     writeDiff(1, level);
    810     writeDiff(65, level);
    811     writeDiff(130, level);
    812     writeDiff(30000, level);
    813     writeDiff(1000000, level);
    814     writeDiff(-65, level);
    815     writeDiff(-130, level);
    816     writeDiff(-30000, level);
    817     writeDiff(-1000000, level);
    818 
    819     /* test that each value is smaller than any following one */
    820     countErrors=0;
    821     i=-0x10ffff;
    822     *writeDiff(i, prev)=0;
    823 
    824     /* show first number and bytes */
    825     printBytes(prev, buf1);
    826     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
    827 
    828     for(++i; i<=0x10ffff; ++i) {
    829         *writeDiff(i, level)=0;
    830         cmp=strcmp((const char *)prev, (const char *)level);
    831         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
    832             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
    833                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
    834         }
    835         if(cmp<0) {
    836             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
    837                 /*
    838                  * if the result is good, then print only if the length changed
    839                  * to get little but interesting output
    840                  */
    841                 printBytes(prev, buf1);
    842                 printBytes(level, buf2);
    843                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
    844             }
    845         } else {
    846             ++countErrors;
    847             printBytes(prev, buf1);
    848             printBytes(level, buf2);
    849             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
    850         }
    851         /* remember the previous bytes */
    852         memcpy(prev, level, 4);
    853     }
    854 
    855     /* show last number and bytes */
    856     printBytes((uint8_t *)"", buf1);
    857     printBytes(prev, buf2);
    858     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
    859 
    860     if(countErrors==0) {
    861         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
    862     } else {
    863         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
    864     }
    865 
    866     /* output signature byte sequence */
    867     i=0;
    868     writePacked(encodeBocu1(&i, 0xfeff), level);
    869     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
    870             level[0], level[1], level[2]);
    871 }
    872 
    873 /* cintltst code ------------------------------------------------------------ */
    874 
    875 static const int32_t DEFAULT_BUFFER_SIZE = 30000;
    876 
    877 
    878 /* test one string with the ICU and the reference BOCU-1 implementations */
    879 static void
    880 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
    881     UChar *roundtripRef, *roundtripICU;
    882     char *bocu1Ref, *bocu1ICU;
    883 
    884     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
    885     UErrorCode errorCode;
    886 
    887     roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    888     roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    889     bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
    890     bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
    891 
    892     /* Unicode -> BOCU-1 */
    893     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
    894 
    895     errorCode=U_ZERO_ERROR;
    896     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
    897     if(U_FAILURE(errorCode)) {
    898         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
    899         goto cleanup;
    900     }
    901 
    902     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
    903         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
    904         goto cleanup;
    905     }
    906 
    907     /* BOCU-1 -> Unicode */
    908     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
    909     if(roundtripRefLength<0) {
    910         goto cleanup; /* readString() found an error and reported it */
    911     }
    912 
    913     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
    914     if(U_FAILURE(errorCode)) {
    915         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
    916         goto cleanup;
    917     }
    918 
    919     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
    920         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
    921         goto cleanup;
    922     }
    923     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
    924         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
    925         goto cleanup;
    926     }
    927 cleanup:
    928     free(roundtripRef);
    929     free(roundtripICU);
    930     free(bocu1Ref);
    931     free(bocu1ICU);
    932 }
    933 
    934 static const UChar feff[]={ 0xfeff };
    935 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
    936 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
    937 static const UChar nul[]={ 0 };
    938 static const UChar latin[]={ 0xdf, 0xe6 };
    939 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
    940 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
    941 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
    942 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
    943 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
    944 static const UChar plane1[]={ 0xd800, 0xdc00 };
    945 static const UChar plane2[]={ 0xd845, 0xdddd };
    946 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
    947 static const UChar plane16[]={ 0xdbff, 0xdfff };
    948 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
    949 
    950 static const struct {
    951     const UChar *s;
    952     int32_t length;
    953 } strings[]={
    954     { feff,         LENGTHOF(feff) },
    955     { ascii,        LENGTHOF(ascii) },
    956     { crlf,         LENGTHOF(crlf) },
    957     { nul,          LENGTHOF(nul) },
    958     { latin,        LENGTHOF(latin) },
    959     { devanagari,   LENGTHOF(devanagari) },
    960     { hiragana,     LENGTHOF(hiragana) },
    961     { unihan,       LENGTHOF(unihan) },
    962     { hangul,       LENGTHOF(hangul) },
    963     { surrogates,   LENGTHOF(surrogates) },
    964     { plane1,       LENGTHOF(plane1) },
    965     { plane2,       LENGTHOF(plane2) },
    966     { plane15,      LENGTHOF(plane15) },
    967     { plane16,      LENGTHOF(plane16) },
    968     { c0,           LENGTHOF(c0) }
    969 };
    970 
    971 /*
    972  * Verify that the ICU BOCU-1 implementation produces the same results as
    973  * the reference implementation from the design folder.
    974  * Generate some texts and convert them with both converters, verifying
    975  * identical results and roundtripping.
    976  */
    977 static void
    978 TestBOCU1(void) {
    979     UChar *text;
    980     int32_t i, length;
    981 
    982     UConverter *bocu1;
    983     UErrorCode errorCode;
    984 
    985     errorCode=U_ZERO_ERROR;
    986     bocu1=ucnv_open("BOCU-1", &errorCode);
    987     if(U_FAILURE(errorCode)) {
    988         log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
    989         return;
    990     }
    991 
    992     text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    993 
    994     /* text 1: each of strings[] once */
    995     length=0;
    996     for(i=0; i<LENGTHOF(strings); ++i) {
    997         u_memcpy(text+length, strings[i].s, strings[i].length);
    998         length+=strings[i].length;
    999     }
   1000     roundtripBOCU1(bocu1, 1, text, length);
   1001 
   1002     /* text 2: each of strings[] twice */
   1003     length=0;
   1004     for(i=0; i<LENGTHOF(strings); ++i) {
   1005         u_memcpy(text+length, strings[i].s, strings[i].length);
   1006         length+=strings[i].length;
   1007         u_memcpy(text+length, strings[i].s, strings[i].length);
   1008         length+=strings[i].length;
   1009     }
   1010     roundtripBOCU1(bocu1, 2, text, length);
   1011 
   1012     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
   1013     length=0;
   1014     for(i=1; length<5000; i+=7) {
   1015         if(i>=LENGTHOF(strings)) {
   1016             i-=LENGTHOF(strings);
   1017         }
   1018         u_memcpy(text+length, strings[i].s, strings[i].length);
   1019         length+=strings[i].length;
   1020     }
   1021     roundtripBOCU1(bocu1, 3, text, length);
   1022 
   1023     ucnv_close(bocu1);
   1024     free(text);
   1025 }
   1026 
   1027 U_CFUNC void addBOCU1Tests(TestNode** root);
   1028 
   1029 U_CFUNC void
   1030 addBOCU1Tests(TestNode** root) {
   1031     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
   1032     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
   1033 }
   1034