Home | History | Annotate | Download | only in cintltst
      1 /*
      2 ******************************************************************************
      3 *
      4 *   Copyright (C) 2002-2015, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 ******************************************************************************
      8 *   file name:  bocu1tst.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2002may27
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This is the reference implementation of BOCU-1,
     17 *   the MIME-friendly form of the Binary Ordered Compression for Unicode,
     18 *   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
     19 *   The files bocu1.h and bocu1.c from the design folder are taken
     20 *   verbatim (minus copyright and #include) and copied together into this file.
     21 *   The reference code and some of the reference bocu1tst.c
     22 *   is modified to run as part of the ICU cintltst
     23 *   test framework (minus main(), log_ln() etc. instead of printf()).
     24 *
     25 *   This reference implementation is used here to verify
     26 *   the ICU BOCU-1 implementation, which is
     27 *   adapted for ICU conversion APIs and optimized.
     28 *   ### links in design doc to here and to ucnvbocu.c
     29 */
     30 
     31 #include "unicode/utypes.h"
     32 #include "unicode/ustring.h"
     33 #include "unicode/ucnv.h"
     34 #include "unicode/utf16.h"
     35 #include "cmemory.h"
     36 #include "cintltst.h"
     37 
     38 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */
     39 
     40 /* BOCU-1 constants and macros ---------------------------------------------- */
     41 
     42 /*
     43  * BOCU-1 encodes the code points of a Unicode string as
     44  * a sequence of byte-encoded differences (slope detection),
     45  * preserving lexical order.
     46  *
     47  * Optimize the difference-taking for runs of Unicode text within
     48  * small scripts:
     49  *
     50  * Most small scripts are allocated within aligned 128-blocks of Unicode
     51  * code points. Lexical order is preserved if the "previous code point" state
     52  * is always moved into the middle of such a block.
     53  *
     54  * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
     55  * areas into the middle of those areas.
     56  *
     57  * C0 control codes and space are encoded with their US-ASCII bytes.
     58  * "prev" is reset for C0 controls but not for space.
     59  */
     60 
/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
/* lowest lead byte of a difference sequence; byte values 0x00..0x20 are handled separately */
#define BOCU1_MIN               0x21
/* lead byte of the single-byte sequence that encodes a difference of 0 */
#define BOCU1_MIDDLE            0x90
/* highest lead byte of a difference sequence */
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
/* note: because of the L suffix, every expression built on BOCU1_MAX_TRAIL has type long */
#define BOCU1_MAX_TRAIL         0xffL
/* in lead position this byte only resets the decoder state, see decodeBocu1() */
#define BOCU1_RESET             0xff

/* number of lead bytes */
/* 0xfe-0x21+1 == 222 */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
/* 0x21-20 == 13: subtracted from a byte >0x20 to get its trail value (20..242) */
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes */
/* (0xff-0x21+1)+20 == 243 */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters. */
/* 63 and -64 */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters. */
/* 63+43*243 == 10512 and -64-43*243 == -10513 */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters. */
/* 10512+3*243*243 == 187659 and -10513-3*243*243 == -187660 */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values. */
/* positive side: 0xd0, 0xfb, 0xfe */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* negative side: 0x50, 0x25, 0x22 */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */
    118 
/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 *
 * The lead byte ranges are nested around BOCU1_MIDDLE, so the tests go from
 * the innermost (single-byte) range outwards; whatever is left is a
 * four-byte lead.
 * The argument is evaluated several times: pass an expression without
 * side effects.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 *
 * See packDiff() for the packed format: for 1..3 bytes the top byte holds
 * the length (0x01..0x03); otherwise the top byte is the lead byte of a
 * four-byte sequence. The comparison is done unsigned so that packed values
 * with the top bit set are classified as four-byte sequences.
 * A packed value of 0 (the error result of encodeBocu1()) yields length 0.
 * The argument is evaluated twice.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
    128 
    129 /*
    130  * 12 commonly used C0 control codes (and space) are only used to encode
    131  * themselves directly,
    132  * which makes BOCU-1 MIME-usable and reasonably safe for
    133  * ASCII-oriented software.
    134  *
    135  * These controls are
    136  *  0   NUL
    137  *
    138  *  7   BEL
    139  *  8   BS
    140  *
    141  *  9   TAB
    142  *  a   LF
    143  *  b   VT
    144  *  c   FF
    145  *  d   CR
    146  *
    147  *  e   SO
    148  *  f   SI
    149  *
    150  * 1a   SUB
    151  * 1b   ESC
    152  *
    153  * The other 20 C0 controls are also encoded directly (to preserve order)
    154  * but are also used as trail bytes in difference encoding
    155  * (for better compression).
    156  */
    157 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])
    158 
    159 /*
    160  * Byte value map for control codes,
    161  * from external byte values 0x00..0x20
    162  * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
    163  * External byte values that are illegal as trail bytes are mapped to -1.
    164  */
    165 static const int8_t
    166 bocu1ByteToTrail[BOCU1_MIN]={
    167 /*  0     1     2     3     4     5     6     7    */
    168     -1,   0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1,
    169 
    170 /*  8     9     a     b     c     d     e     f    */
    171     -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1,
    172 
    173 /*  10    11    12    13    14    15    16    17   */
    174     0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d,
    175 
    176 /*  18    19    1a    1b    1c    1d    1e    1f   */
    177     0x0e, 0x0f, -1,   -1,   0x10, 0x11, 0x12, 0x13,
    178 
    179 /*  20   */
    180     -1
    181 };
    182 
    183 /*
    184  * Byte value map for control codes,
    185  * from trail byte values 0..19 (0..0x13) as used in the difference calculation
    186  * to external byte values 0x00..0x20.
    187  */
    188 static const int8_t
    189 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={
    190 /*  0     1     2     3     4     5     6     7    */
    191     0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11,
    192 
    193 /*  8     9     a     b     c     d     e     f    */
    194     0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
    195 
    196 /*  10    11    12    13   */
    197     0x1c, 0x1d, 0x1e, 0x1f
    198 };
    199 
    200 /**
    201  * Integer division and modulo with negative numerators
    202  * yields negative modulo results and quotients that are one more than
    203  * what we need here.
    204  * This macro adjust the results so that the modulo-value m is always >=0.
    205  *
    206  * For positive n, the if() condition is always FALSE.
    207  *
    208  * @param n Number to be split into quotient and rest.
    209  *          Will be modified to contain the quotient.
    210  * @param d Divisor.
    211  * @param m Output variable for the rest (modulo result).
    212  */
    213 #define NEGDIVMOD(n, d, m) { \
    214     (m)=(n)%(d); \
    215     (n)/=(d); \
    216     if((m)<0) { \
    217         --(n); \
    218         (m)+=(d); \
    219     } \
    220 }
    221 
    222 /* State for BOCU-1 decoder function. */
    223 struct Bocu1Rx {
    224     int32_t prev, count, diff;
    225 };
    226 
    227 typedef struct Bocu1Rx Bocu1Rx;
    228 
    229 /* Function prototypes ------------------------------------------------------ */
    230 
    231 /* see bocu1.c */
    232 U_CFUNC int32_t
    233 packDiff(int32_t diff);
    234 
    235 U_CFUNC int32_t
    236 encodeBocu1(int32_t *pPrev, int32_t c);
    237 
    238 U_CFUNC int32_t
    239 decodeBocu1(Bocu1Rx *pRx, uint8_t b);
    240 
    241 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */
    242 
    243 /* BOCU-1 implementation functions ------------------------------------------ */
    244 
    245 /**
    246  * Compute the next "previous" value for differencing
    247  * from the current code point.
    248  *
    249  * @param c current code point, 0..0x10ffff
    250  * @return "previous code point" state value
    251  */
    252 static int32_t
    253 bocu1Prev(int32_t c) {
    254     /* compute new prev */
    255     if(0x3040<=c && c<=0x309f) {
    256         /* Hiragana is not 128-aligned */
    257         return 0x3070;
    258     } else if(0x4e00<=c && c<=0x9fa5) {
    259         /* CJK Unihan */
    260         return 0x4e00-BOCU1_REACH_NEG_2;
    261     } else if(0xac00<=c && c<=0xd7a3) {
    262         /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */
    263         return ((int32_t)0xd7a3+(int32_t)0xac00)/2;
    264     } else {
    265         /* mostly small scripts */
    266         return (c&~0x7f)+BOCU1_ASCII_PREV;
    267     }
    268 }
    269 
    270 /**
    271  * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes
    272  * and return a packed integer with them.
    273  *
    274  * The encoding favors small absolut differences with short encodings
    275  * to compress runs of same-script characters.
    276  *
    277  * @param diff difference value -0x10ffff..0x10ffff
    278  * @return
    279  *      0x010000zz for 1-byte sequence zz
    280  *      0x0200yyzz for 2-byte sequence yy zz
    281  *      0x03xxyyzz for 3-byte sequence xx yy zz
    282  *      0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03)
    283  */
    284 U_CFUNC int32_t
    285 packDiff(int32_t diff) {
    286     int32_t result, m, lead, count, shift;
    287 
    288     if(diff>=BOCU1_REACH_NEG_1) {
    289         /* mostly positive differences, and single-byte negative ones */
    290         if(diff<=BOCU1_REACH_POS_1) {
    291             /* single byte */
    292             return 0x01000000|(BOCU1_MIDDLE+diff);
    293         } else if(diff<=BOCU1_REACH_POS_2) {
    294             /* two bytes */
    295             diff-=BOCU1_REACH_POS_1+1;
    296             lead=BOCU1_START_POS_2;
    297             count=1;
    298         } else if(diff<=BOCU1_REACH_POS_3) {
    299             /* three bytes */
    300             diff-=BOCU1_REACH_POS_2+1;
    301             lead=BOCU1_START_POS_3;
    302             count=2;
    303         } else {
    304             /* four bytes */
    305             diff-=BOCU1_REACH_POS_3+1;
    306             lead=BOCU1_START_POS_4;
    307             count=3;
    308         }
    309     } else {
    310         /* two- and four-byte negative differences */
    311         if(diff>=BOCU1_REACH_NEG_2) {
    312             /* two bytes */
    313             diff-=BOCU1_REACH_NEG_1;
    314             lead=BOCU1_START_NEG_2;
    315             count=1;
    316         } else if(diff>=BOCU1_REACH_NEG_3) {
    317             /* three bytes */
    318             diff-=BOCU1_REACH_NEG_2;
    319             lead=BOCU1_START_NEG_3;
    320             count=2;
    321         } else {
    322             /* four bytes */
    323             diff-=BOCU1_REACH_NEG_3;
    324             lead=BOCU1_START_NEG_4;
    325             count=3;
    326         }
    327     }
    328 
    329     /* encode the length of the packed result */
    330     if(count<3) {
    331         result=(count+1)<<24;
    332     } else /* count==3, MSB used for the lead byte */ {
    333         result=0;
    334     }
    335 
    336     /* calculate trail bytes like digits in itoa() */
    337     shift=0;
    338     do {
    339         NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
    340         result|=BOCU1_TRAIL_TO_BYTE(m)<<shift;
    341         shift+=8;
    342     } while(--count>0);
    343 
    344     /* add lead byte */
    345     result|=(lead+diff)<<shift;
    346 
    347     return result;
    348 }
    349 
    350 /**
    351  * BOCU-1 encoder function.
    352  *
    353  * @param pPrev pointer to the integer that holds
    354  *        the "previous code point" state;
    355  *        the initial value should be 0 which
    356  *        encodeBocu1 will set to the actual BOCU-1 initial state value
    357  * @param c the code point to encode
    358  * @return the packed 1/2/3/4-byte encoding, see packDiff(),
    359  *         or 0 if an error occurs
    360  *
    361  * @see packDiff
    362  */
    363 U_CFUNC int32_t
    364 encodeBocu1(int32_t *pPrev, int32_t c) {
    365     int32_t prev;
    366 
    367     if(pPrev==NULL || c<0 || c>0x10ffff) {
    368         /* illegal argument */
    369         return 0;
    370     }
    371 
    372     prev=*pPrev;
    373     if(prev==0) {
    374         /* lenient handling of initial value 0 */
    375         prev=*pPrev=BOCU1_ASCII_PREV;
    376     }
    377 
    378     if(c<=0x20) {
    379         /*
    380          * ISO C0 control & space:
    381          * Encode directly for MIME compatibility,
    382          * and reset state except for space, to not disrupt compression.
    383          */
    384         if(c!=0x20) {
    385             *pPrev=BOCU1_ASCII_PREV;
    386         }
    387         return 0x01000000|c;
    388     }
    389 
    390     /*
    391      * all other Unicode code points c==U+0021..U+10ffff
    392      * are encoded with the difference c-prev
    393      *
    394      * a new prev is computed from c,
    395      * placed in the middle of a 0x80-block (for most small scripts) or
    396      * in the middle of the Unihan and Hangul blocks
    397      * to statistically minimize the following difference
    398      */
    399     *pPrev=bocu1Prev(c);
    400     return packDiff(c-prev);
    401 }
    402 
    403 /**
    404  * Function for BOCU-1 decoder; handles multi-byte lead bytes.
    405  *
    406  * @param pRx pointer to the decoder state structure
    407  * @param b lead byte;
    408  *          BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD
    409  * @return -1 (state change only)
    410  *
    411  * @see decodeBocu1
    412  */
    413 static int32_t
    414 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) {
    415     int32_t c, count;
    416 
    417     if(b>=BOCU1_START_NEG_2) {
    418         /* positive difference */
    419         if(b<BOCU1_START_POS_3) {
    420             /* two bytes */
    421             c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1;
    422             count=1;
    423         } else if(b<BOCU1_START_POS_4) {
    424             /* three bytes */
    425             c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1;
    426             count=2;
    427         } else {
    428             /* four bytes */
    429             c=BOCU1_REACH_POS_3+1;
    430             count=3;
    431         }
    432     } else {
    433         /* negative difference */
    434         if(b>=BOCU1_START_NEG_3) {
    435             /* two bytes */
    436             c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1;
    437             count=1;
    438         } else if(b>BOCU1_MIN) {
    439             /* three bytes */
    440             c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2;
    441             count=2;
    442         } else {
    443             /* four bytes */
    444             c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3;
    445             count=3;
    446         }
    447     }
    448 
    449     /* set the state for decoding the trail byte(s) */
    450     pRx->diff=c;
    451     pRx->count=count;
    452     return -1;
    453 }
    454 
    455 /**
    456  * Function for BOCU-1 decoder; handles multi-byte trail bytes.
    457  *
    458  * @param pRx pointer to the decoder state structure
    459  * @param b trail byte
    460  * @return result value, same as decodeBocu1
    461  *
    462  * @see decodeBocu1
    463  */
    464 static int32_t
    465 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
    466     int32_t t, c, count;
    467 
    468     if(b<=0x20) {
    469         /* skip some C0 controls and make the trail byte range contiguous */
    470         t=bocu1ByteToTrail[b];
    471         if(t<0) {
    472             /* illegal trail byte value */
    473             pRx->prev=BOCU1_ASCII_PREV;
    474             pRx->count=0;
    475             return -99;
    476         }
    477 #if BOCU1_MAX_TRAIL<0xff
    478     } else if(b>BOCU1_MAX_TRAIL) {
    479         return -99;
    480 #endif
    481     } else {
    482         t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
    483     }
    484 
    485     /* add trail byte into difference and decrement count */
    486     c=pRx->diff;
    487     count=pRx->count;
    488 
    489     if(count==1) {
    490         /* final trail byte, deliver a code point */
    491         c=pRx->prev+c+t;
    492         if(0<=c && c<=0x10ffff) {
    493             /* valid code point result */
    494             pRx->prev=bocu1Prev(c);
    495             pRx->count=0;
    496             return c;
    497         } else {
    498             /* illegal code point result */
    499             pRx->prev=BOCU1_ASCII_PREV;
    500             pRx->count=0;
    501             return -99;
    502         }
    503     }
    504 
    505     /* intermediate trail byte */
    506     if(count==2) {
    507         pRx->diff=c+t*BOCU1_TRAIL_COUNT;
    508     } else /* count==3 */ {
    509         pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
    510     }
    511     pRx->count=count-1;
    512     return -1;
    513 }
    514 
    515 /**
    516  * BOCU-1 decoder function.
    517  *
    518  * @param pRx pointer to the decoder state structure;
    519  *        the initial values should be 0 which
    520  *        decodeBocu1 will set to actual initial state values
    521  * @param b an input byte
    522  * @return
    523  *      0..0x10ffff for a result code point
    524  *      -1 if only the state changed without code point output
    525  *     <-1 if an error occurs
    526  */
    527 U_CFUNC int32_t
    528 decodeBocu1(Bocu1Rx *pRx, uint8_t b) {
    529     int32_t prev, c, count;
    530 
    531     if(pRx==NULL) {
    532         /* illegal argument */
    533         return -99;
    534     }
    535 
    536     prev=pRx->prev;
    537     if(prev==0) {
    538         /* lenient handling of initial 0 values */
    539         prev=pRx->prev=BOCU1_ASCII_PREV;
    540         count=pRx->count=0;
    541     } else {
    542         count=pRx->count;
    543     }
    544 
    545     if(count==0) {
    546         /* byte in lead position */
    547         if(b<=0x20) {
    548             /*
    549              * Direct-encoded C0 control code or space.
    550              * Reset prev for C0 control codes but not for space.
    551              */
    552             if(b!=0x20) {
    553                 pRx->prev=BOCU1_ASCII_PREV;
    554             }
    555             return b;
    556         }
    557 
    558         /*
    559          * b is a difference lead byte.
    560          *
    561          * Return a code point directly from a single-byte difference.
    562          *
    563          * For multi-byte difference lead bytes, set the decoder state
    564          * with the partial difference value from the lead byte and
    565          * with the number of trail bytes.
    566          *
    567          * For four-byte differences, the signedness also affects the
    568          * first trail byte, which has special handling farther below.
    569          */
    570         if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) {
    571             /* single-byte difference */
    572             c=prev+((int32_t)b-BOCU1_MIDDLE);
    573             pRx->prev=bocu1Prev(c);
    574             return c;
    575         } else if(b==BOCU1_RESET) {
    576             /* only reset the state, no code point */
    577             pRx->prev=BOCU1_ASCII_PREV;
    578             return -1;
    579         } else {
    580             return decodeBocu1LeadByte(pRx, b);
    581         }
    582     } else {
    583         /* trail byte in any position */
    584         return decodeBocu1TrailByte(pRx, b);
    585     }
    586 }
    587 
    588 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */
    589 
    590 /* test code ---------------------------------------------------------------- */
    591 
    592 /* test code options */
    593 
/*
 * ignore comma when processing name lists in testText()
 * (testText() is not defined in the visible part of this file;
 * the option may be unused here - TODO confirm)
 */
#define TEST_IGNORE_COMMA       1
    596 
    597 /**
    598  * Write a packed BOCU-1 byte sequence into a byte array,
    599  * without overflow check.
    600  * Test function.
    601  *
    602  * @param packed packed BOCU-1 byte sequence, see packDiff()
    603  * @param p pointer to byte array
    604  * @return number of bytes
    605  *
    606  * @see packDiff
    607  */
    608 static int32_t
    609 writePacked(int32_t packed, uint8_t *p) {
    610     int32_t count=BOCU1_LENGTH_FROM_PACKED(packed);
    611     switch(count) {
    612     case 4:
    613         *p++=(uint8_t)(packed>>24);
    614     case 3:
    615         *p++=(uint8_t)(packed>>16);
    616     case 2:
    617         *p++=(uint8_t)(packed>>8);
    618     case 1:
    619         *p++=(uint8_t)packed;
    620     default:
    621         break;
    622     }
    623 
    624     return count;
    625 }
    626 
    627 /**
    628  * Unpack a packed BOCU-1 non-C0/space byte sequence and get
    629  * the difference to initialPrev.
    630  * Used only for round-trip testing of the difference encoding and decoding.
    631  * Test function.
    632  *
    633  * @param initialPrev bogus "previous code point" value to make sure that
    634  *                    the resulting code point is in the range 0..0x10ffff
    635  * @param packed packed BOCU-1 byte sequence
    636  * @return the difference to initialPrev
    637  *
    638  * @see packDiff
    639  * @see writeDiff
    640  */
    641 static int32_t
    642 unpackDiff(int32_t initialPrev, int32_t packed) {
    643     Bocu1Rx rx={ 0, 0, 0 };
    644     int32_t count;
    645 
    646     rx.prev=initialPrev;
    647     count=BOCU1_LENGTH_FROM_PACKED(packed);
    648     switch(count) {
    649     case 4:
    650         decodeBocu1(&rx, (uint8_t)(packed>>24));
    651     case 3:
    652         decodeBocu1(&rx, (uint8_t)(packed>>16));
    653     case 2:
    654         decodeBocu1(&rx, (uint8_t)(packed>>8));
    655     case 1:
    656         /* subtract initial prev */
    657         return decodeBocu1(&rx, (uint8_t)packed)-initialPrev;
    658     default:
    659         return -0x7fffffff;
    660     }
    661 }
    662 
    663 /**
    664  * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes,
    665  * preserving lexical order.
    666  * Also checks for roundtripping of the difference encoding.
    667  * Test function.
    668  *
    669  * @param diff difference value to test, -0x10ffff..0x10ffff
    670  * @param p pointer to output byte array
    671  * @return p advanced by number of bytes output
    672  *
    673  * @see unpackDiff
    674  */
    675 static uint8_t *
    676 writeDiff(int32_t diff, uint8_t *p) {
    677     /* generate the difference as a packed value and serialize it */
    678     int32_t packed, initialPrev;
    679 
    680     packed=packDiff(diff);
    681 
    682     /*
    683      * bogus initial "prev" to work around
    684      * code point range check in decodeBocu1()
    685      */
    686     if(diff<=0) {
    687         initialPrev=0x10ffff;
    688     } else {
    689         initialPrev=-1;
    690     }
    691 
    692     if(diff!=unpackDiff(initialPrev, packed)) {
    693         log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n",
    694                 diff, packed, unpackDiff(initialPrev, packed));
    695     }
    696     return p+writePacked(packed, p);
    697 }
    698 
    699 /**
    700  * Encode a UTF-16 string in BOCU-1.
    701  * Does not check for overflows, but otherwise useful function.
    702  *
    703  * @param s input UTF-16 string
    704  * @param length number of UChar code units in s
    705  * @param p pointer to output byte array
    706  * @return number of bytes output
    707  */
    708 static int32_t
    709 writeString(const UChar *s, int32_t length, uint8_t *p) {
    710     uint8_t *p0;
    711     int32_t c, prev, i;
    712 
    713     prev=0;
    714     p0=p;
    715     i=0;
    716     while(i<length) {
    717         U16_NEXT(s, i, length, c);
    718         p+=writePacked(encodeBocu1(&prev, c), p);
    719     }
    720     return (int32_t)(p-p0);
    721 }
    722 
    723 /**
    724  * Decode a BOCU-1 byte sequence to a UTF-16 string.
    725  * Does not check for overflows, but otherwise useful function.
    726  *
    727  * @param p pointer to input BOCU-1 bytes
    728  * @param length number of input bytes
    729  * @param s point to output UTF-16 string array
    730  * @return number of UChar code units output
    731  */
    732 static int32_t
    733 readString(const uint8_t *p, int32_t length, UChar *s) {
    734     Bocu1Rx rx={ 0, 0, 0 };
    735     int32_t c, i, sLength;
    736 
    737     i=sLength=0;
    738     while(i<length) {
    739         c=decodeBocu1(&rx, p[i++]);
    740         if(c<-1) {
    741             log_err("error: readString detects encoding error at string index %ld\n", i);
    742             return -1;
    743         }
    744         if(c>=0) {
    745             U16_APPEND_UNSAFE(s, sLength, c);
    746         }
    747     }
    748     return sLength;
    749 }
    750 
    751 static char
    752 hexDigit(uint8_t digit) {
    753     return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit);
    754 }
    755 
    756 /**
    757  * Pretty-print 0-terminated byte values.
    758  * Helper function for test output.
    759  *
    760  * @param bytes 0-terminated byte array to print
    761  */
    762 static void
    763 printBytes(uint8_t *bytes, char *out) {
    764     int i;
    765     uint8_t b;
    766 
    767     i=0;
    768     while((b=*bytes++)!=0) {
    769         *out++=' ';
    770         *out++=hexDigit((uint8_t)(b>>4));
    771         *out++=hexDigit((uint8_t)(b&0xf));
    772         ++i;
    773     }
    774     i=3*(5-i);
    775     while(i>0) {
    776         *out++=' ';
    777         --i;
    778     }
    779     *out=0;
    780 }
    781 
    782 /**
    783  * Basic BOCU-1 test function, called when there are no command line arguments.
    784  * Prints some of the #define values and performs round-trip tests of the
    785  * difference encoding and decoding.
    786  */
    787 static void
    788 TestBOCU1RefDiff(void) {
    789     char buf1[80], buf2[80];
    790     uint8_t prev[5], level[5];
    791     int32_t i, cmp, countErrors;
    792 
    793     log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1);
    794     log_verbose("reach of 2 bytes     : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2);
    795     log_verbose("reach of 3 bytes     : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3);
    796 
    797     log_verbose("    BOCU1_REACH_NEG_1 %8ld    BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1);
    798     log_verbose("    BOCU1_REACH_NEG_2 %8ld    BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2);
    799     log_verbose("    BOCU1_REACH_NEG_3 %8ld    BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3);
    800 
    801     log_verbose("    BOCU1_MIDDLE      0x%02x\n", BOCU1_MIDDLE);
    802     log_verbose("    BOCU1_START_NEG_2 0x%02x    BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2);
    803     log_verbose("    BOCU1_START_NEG_3 0x%02x    BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3);
    804 
    805     /* test packDiff() & unpackDiff() with some specific values */
    806     writeDiff(0, level);
    807     writeDiff(1, level);
    808     writeDiff(65, level);
    809     writeDiff(130, level);
    810     writeDiff(30000, level);
    811     writeDiff(1000000, level);
    812     writeDiff(-65, level);
    813     writeDiff(-130, level);
    814     writeDiff(-30000, level);
    815     writeDiff(-1000000, level);
    816 
    817     /* test that each value is smaller than any following one */
    818     countErrors=0;
    819     i=-0x10ffff;
    820     *writeDiff(i, prev)=0;
    821 
    822     /* show first number and bytes */
    823     printBytes(prev, buf1);
    824     log_verbose("              wD(%8ld)                    %s\n", i, buf1);
    825 
    826     for(++i; i<=0x10ffff; ++i) {
    827         *writeDiff(i, level)=0;
    828         cmp=strcmp((const char *)prev, (const char *)level);
    829         if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) {
    830             log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n",
    831                    level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i);
    832         }
    833         if(cmp<0) {
    834             if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) {
    835                 /*
    836                  * if the result is good, then print only if the length changed
    837                  * to get little but interesting output
    838                  */
    839                 printBytes(prev, buf1);
    840                 printBytes(level, buf2);
    841                 log_verbose("ok:    strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
    842             }
    843         } else {
    844             ++countErrors;
    845             printBytes(prev, buf1);
    846             printBytes(level, buf2);
    847             log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d  %s%s\n", i-1, i, cmp, buf1, buf2);
    848         }
    849         /* remember the previous bytes */
    850         memcpy(prev, level, 4);
    851     }
    852 
    853     /* show last number and bytes */
    854     printBytes((uint8_t *)"", buf1);
    855     printBytes(prev, buf2);
    856     log_verbose("                            wD(%8ld)      %s%s\n", i-1, buf1, buf2);
    857 
    858     if(countErrors==0) {
    859         log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n");
    860     } else {
    861         log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors);
    862     }
    863 
    864     /* output signature byte sequence */
    865     i=0;
    866     writePacked(encodeBocu1(&i, 0xfeff), level);
    867     log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n",
    868             level[0], level[1], level[2]);
    869 }
    870 
    871 /* cintltst code ------------------------------------------------------------ */
    872 
    873 static const int32_t DEFAULT_BUFFER_SIZE = 30000;
    874 
    875 
    876 /* test one string with the ICU and the reference BOCU-1 implementations */
    877 static void
    878 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) {
    879     UChar *roundtripRef, *roundtripICU;
    880     char *bocu1Ref, *bocu1ICU;
    881 
    882     int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength;
    883     UErrorCode errorCode;
    884 
    885     roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    886     roundtripICU = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    887     bocu1Ref = malloc(DEFAULT_BUFFER_SIZE);
    888     bocu1ICU = malloc(DEFAULT_BUFFER_SIZE);
    889 
    890     /* Unicode -> BOCU-1 */
    891     bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref);
    892 
    893     errorCode=U_ZERO_ERROR;
    894     bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode);
    895     if(U_FAILURE(errorCode)) {
    896         log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
    897         goto cleanup;
    898     }
    899 
    900     if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) {
    901         log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength);
    902         goto cleanup;
    903     }
    904 
    905     /* BOCU-1 -> Unicode */
    906     roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef);
    907     if(roundtripRefLength<0) {
    908         goto cleanup; /* readString() found an error and reported it */
    909     }
    910 
    911     roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode);
    912     if(U_FAILURE(errorCode)) {
    913         log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode));
    914         goto cleanup;
    915     }
    916 
    917     if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) {
    918         log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength);
    919         goto cleanup;
    920     }
    921     if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) {
    922         log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength);
    923         goto cleanup;
    924     }
    925 cleanup:
    926     free(roundtripRef);
    927     free(roundtripICU);
    928     free(bocu1Ref);
    929     free(bocu1ICU);
    930 }
    931 
    932 static const UChar feff[]={ 0xfeff };
    933 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 };
    934 static const UChar crlf[]={ 0xd, 0xa, 0x20 };
    935 static const UChar nul[]={ 0 };
    936 static const UChar latin[]={ 0xdf, 0xe6 };
    937 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 };
    938 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 };
    939 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 };
    940 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 };
    941 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */
    942 static const UChar plane1[]={ 0xd800, 0xdc00 };
    943 static const UChar plane2[]={ 0xd845, 0xdddd };
    944 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 };
    945 static const UChar plane16[]={ 0xdbff, 0xdfff };
    946 static const UChar c0[]={ 1, 0xe40, 0x20, 9 };
    947 
    948 static const struct {
    949     const UChar *s;
    950     int32_t length;
    951 } strings[]={
    952     { feff,         UPRV_LENGTHOF(feff) },
    953     { ascii,        UPRV_LENGTHOF(ascii) },
    954     { crlf,         UPRV_LENGTHOF(crlf) },
    955     { nul,          UPRV_LENGTHOF(nul) },
    956     { latin,        UPRV_LENGTHOF(latin) },
    957     { devanagari,   UPRV_LENGTHOF(devanagari) },
    958     { hiragana,     UPRV_LENGTHOF(hiragana) },
    959     { unihan,       UPRV_LENGTHOF(unihan) },
    960     { hangul,       UPRV_LENGTHOF(hangul) },
    961     { surrogates,   UPRV_LENGTHOF(surrogates) },
    962     { plane1,       UPRV_LENGTHOF(plane1) },
    963     { plane2,       UPRV_LENGTHOF(plane2) },
    964     { plane15,      UPRV_LENGTHOF(plane15) },
    965     { plane16,      UPRV_LENGTHOF(plane16) },
    966     { c0,           UPRV_LENGTHOF(c0) }
    967 };
    968 
    969 /*
    970  * Verify that the ICU BOCU-1 implementation produces the same results as
    971  * the reference implementation from the design folder.
    972  * Generate some texts and convert them with both converters, verifying
    973  * identical results and roundtripping.
    974  */
    975 static void
    976 TestBOCU1(void) {
    977     UChar *text;
    978     int32_t i, length;
    979 
    980     UConverter *bocu1;
    981     UErrorCode errorCode;
    982 
    983     errorCode=U_ZERO_ERROR;
    984     bocu1=ucnv_open("BOCU-1", &errorCode);
    985     if(U_FAILURE(errorCode)) {
    986         log_data_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode));
    987         return;
    988     }
    989 
    990     text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar));
    991 
    992     /* text 1: each of strings[] once */
    993     length=0;
    994     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
    995         u_memcpy(text+length, strings[i].s, strings[i].length);
    996         length+=strings[i].length;
    997     }
    998     roundtripBOCU1(bocu1, 1, text, length);
    999 
   1000     /* text 2: each of strings[] twice */
   1001     length=0;
   1002     for(i=0; i<UPRV_LENGTHOF(strings); ++i) {
   1003         u_memcpy(text+length, strings[i].s, strings[i].length);
   1004         length+=strings[i].length;
   1005         u_memcpy(text+length, strings[i].s, strings[i].length);
   1006         length+=strings[i].length;
   1007     }
   1008     roundtripBOCU1(bocu1, 2, text, length);
   1009 
   1010     /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */
   1011     length=0;
   1012     for(i=1; length<5000; i+=7) {
   1013         if(i>=UPRV_LENGTHOF(strings)) {
   1014             i-=UPRV_LENGTHOF(strings);
   1015         }
   1016         u_memcpy(text+length, strings[i].s, strings[i].length);
   1017         length+=strings[i].length;
   1018     }
   1019     roundtripBOCU1(bocu1, 3, text, length);
   1020 
   1021     ucnv_close(bocu1);
   1022     free(text);
   1023 }
   1024 
   1025 U_CFUNC void addBOCU1Tests(TestNode** root);
   1026 
   1027 U_CFUNC void
   1028 addBOCU1Tests(TestNode** root) {
   1029     addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff");
   1030     addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1");
   1031 }
   1032