1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2002-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: bocu1tst.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002may27 14 * created by: Markus W. Scherer 15 * 16 * This is the reference implementation of BOCU-1, 17 * the MIME-friendly form of the Binary Ordered Compression for Unicode, 18 * taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/ 19 * The files bocu1.h and bocu1.c from the design folder are taken 20 * verbatim (minus copyright and #include) and copied together into this file. 21 * The reference code and some of the reference bocu1tst.c 22 * is modified to run as part of the ICU cintltst 23 * test framework (minus main(), log_ln() etc. instead of printf()). 24 * 25 * This reference implementation is used here to verify 26 * the ICU BOCU-1 implementation, which is 27 * adapted for ICU conversion APIs and optimized. 28 * ### links in design doc to here and to ucnvbocu.c 29 */ 30 31 #include "unicode/utypes.h" 32 #include "unicode/ustring.h" 33 #include "unicode/ucnv.h" 34 #include "cmemory.h" 35 #include "cintltst.h" 36 37 #define LENGTHOF(array) (sizeof(array)/sizeof((array)[0])) 38 39 /* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */ 40 41 /* BOCU-1 constants and macros ---------------------------------------------- */ 42 43 /* 44 * BOCU-1 encodes the code points of a Unicode string as 45 * a sequence of byte-encoded differences (slope detection), 46 * preserving lexical order. 47 * 48 * Optimize the difference-taking for runs of Unicode text within 49 * small scripts: 50 * 51 * Most small scripts are allocated within aligned 128-blocks of Unicode 52 * code points. 
Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
/* byte value that only resets the decoder state (see decodeBocu1()) */
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/*
 * adjust trail byte counts for the use of some C0 control byte values:
 * 20 C0 control byte values double as trail bytes, so trail values
 * 20 and above map to byte values by adding the offset 0x21-20 == 13
 */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243); has type long because of BOCU1_MAX_TRAIL */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/*
 * number of lead bytes for positive and negative 2/3/4-byte sequences
 * (2*(64+43+3+1) == 222 == BOCU1_COUNT)
 */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters (-64..63). */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters (-10513..10512). */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters.
*/ 104 #define BOCU1_REACH_POS_3 \ 105 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 106 107 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 108 109 /* The lead byte start values. */ 110 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) 111 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) 112 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) 113 /* ==BOCU1_MAX_LEAD */ 114 115 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) 116 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) 117 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) 118 /* ==BOCU1_MIN+1 */ 119 120 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ 121 #define BOCU1_LENGTH_FROM_LEAD(lead) \ 122 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ 123 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ 124 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) 125 126 /* The length of a byte sequence, according to its packed form. */ 127 #define BOCU1_LENGTH_FROM_PACKED(packed) \ 128 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) 129 130 /* 131 * 12 commonly used C0 control codes (and space) are only used to encode 132 * themselves directly, 133 * which makes BOCU-1 MIME-usable and reasonably safe for 134 * ASCII-oriented software. 135 * 136 * These controls are 137 * 0 NUL 138 * 139 * 7 BEL 140 * 8 BS 141 * 142 * 9 TAB 143 * a LF 144 * b VT 145 * c FF 146 * d CR 147 * 148 * e SO 149 * f SI 150 * 151 * 1a SUB 152 * 1b ESC 153 * 154 * The other 20 C0 controls are also encoded directly (to preserve order) 155 * but are also used as trail bytes in difference encoding 156 * (for better compression). 157 */ 158 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? 
(t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 159 160 /* 161 * Byte value map for control codes, 162 * from external byte values 0x00..0x20 163 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 164 * External byte values that are illegal as trail bytes are mapped to -1. 165 */ 166 static const int8_t 167 bocu1ByteToTrail[BOCU1_MIN]={ 168 /* 0 1 2 3 4 5 6 7 */ 169 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 170 171 /* 8 9 a b c d e f */ 172 -1, -1, -1, -1, -1, -1, -1, -1, 173 174 /* 10 11 12 13 14 15 16 17 */ 175 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 176 177 /* 18 19 1a 1b 1c 1d 1e 1f */ 178 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 179 180 /* 20 */ 181 -1 182 }; 183 184 /* 185 * Byte value map for control codes, 186 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 187 * to external byte values 0x00..0x20. 188 */ 189 static const int8_t 190 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 191 /* 0 1 2 3 4 5 6 7 */ 192 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 193 194 /* 8 9 a b c d e f */ 195 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 196 197 /* 10 11 12 13 */ 198 0x1c, 0x1d, 0x1e, 0x1f 199 }; 200 201 /** 202 * Integer division and modulo with negative numerators 203 * yields negative modulo results and quotients that are one more than 204 * what we need here. 205 * This macro adjust the results so that the modulo-value m is always >=0. 206 * 207 * For positive n, the if() condition is always FALSE. 208 * 209 * @param n Number to be split into quotient and rest. 210 * Will be modified to contain the quotient. 211 * @param d Divisor. 212 * @param m Output variable for the rest (modulo result). 213 */ 214 #define NEGDIVMOD(n, d, m) { \ 215 (m)=(n)%(d); \ 216 (n)/=(d); \ 217 if((m)<0) { \ 218 --(n); \ 219 (m)+=(d); \ 220 } \ 221 } 222 223 /* State for BOCU-1 decoder function. 
*/ 224 struct Bocu1Rx { 225 int32_t prev, count, diff; 226 }; 227 228 typedef struct Bocu1Rx Bocu1Rx; 229 230 /* Function prototypes ------------------------------------------------------ */ 231 232 /* see bocu1.c */ 233 U_CFUNC int32_t 234 packDiff(int32_t diff); 235 236 U_CFUNC int32_t 237 encodeBocu1(int32_t *pPrev, int32_t c); 238 239 U_CFUNC int32_t 240 decodeBocu1(Bocu1Rx *pRx, uint8_t b); 241 242 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */ 243 244 /* BOCU-1 implementation functions ------------------------------------------ */ 245 246 /** 247 * Compute the next "previous" value for differencing 248 * from the current code point. 249 * 250 * @param c current code point, 0..0x10ffff 251 * @return "previous code point" state value 252 */ 253 static U_INLINE int32_t 254 bocu1Prev(int32_t c) { 255 /* compute new prev */ 256 if(0x3040<=c && c<=0x309f) { 257 /* Hiragana is not 128-aligned */ 258 return 0x3070; 259 } else if(0x4e00<=c && c<=0x9fa5) { 260 /* CJK Unihan */ 261 return 0x4e00-BOCU1_REACH_NEG_2; 262 } else if(0xac00<=c && c<=0xd7a3) { 263 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */ 264 return ((int32_t)0xd7a3+(int32_t)0xac00)/2; 265 } else { 266 /* mostly small scripts */ 267 return (c&~0x7f)+BOCU1_ASCII_PREV; 268 } 269 } 270 271 /** 272 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 273 * and return a packed integer with them. 274 * 275 * The encoding favors small absolut differences with short encodings 276 * to compress runs of same-script characters. 
277 * 278 * @param diff difference value -0x10ffff..0x10ffff 279 * @return 280 * 0x010000zz for 1-byte sequence zz 281 * 0x0200yyzz for 2-byte sequence yy zz 282 * 0x03xxyyzz for 3-byte sequence xx yy zz 283 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 284 */ 285 U_CFUNC int32_t 286 packDiff(int32_t diff) { 287 int32_t result, m, lead, count, shift; 288 289 if(diff>=BOCU1_REACH_NEG_1) { 290 /* mostly positive differences, and single-byte negative ones */ 291 if(diff<=BOCU1_REACH_POS_1) { 292 /* single byte */ 293 return 0x01000000|(BOCU1_MIDDLE+diff); 294 } else if(diff<=BOCU1_REACH_POS_2) { 295 /* two bytes */ 296 diff-=BOCU1_REACH_POS_1+1; 297 lead=BOCU1_START_POS_2; 298 count=1; 299 } else if(diff<=BOCU1_REACH_POS_3) { 300 /* three bytes */ 301 diff-=BOCU1_REACH_POS_2+1; 302 lead=BOCU1_START_POS_3; 303 count=2; 304 } else { 305 /* four bytes */ 306 diff-=BOCU1_REACH_POS_3+1; 307 lead=BOCU1_START_POS_4; 308 count=3; 309 } 310 } else { 311 /* two- and four-byte negative differences */ 312 if(diff>=BOCU1_REACH_NEG_2) { 313 /* two bytes */ 314 diff-=BOCU1_REACH_NEG_1; 315 lead=BOCU1_START_NEG_2; 316 count=1; 317 } else if(diff>=BOCU1_REACH_NEG_3) { 318 /* three bytes */ 319 diff-=BOCU1_REACH_NEG_2; 320 lead=BOCU1_START_NEG_3; 321 count=2; 322 } else { 323 /* four bytes */ 324 diff-=BOCU1_REACH_NEG_3; 325 lead=BOCU1_START_NEG_4; 326 count=3; 327 } 328 } 329 330 /* encode the length of the packed result */ 331 if(count<3) { 332 result=(count+1)<<24; 333 } else /* count==3, MSB used for the lead byte */ { 334 result=0; 335 } 336 337 /* calculate trail bytes like digits in itoa() */ 338 shift=0; 339 do { 340 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 341 result|=BOCU1_TRAIL_TO_BYTE(m)<<shift; 342 shift+=8; 343 } while(--count>0); 344 345 /* add lead byte */ 346 result|=(lead+diff)<<shift; 347 348 return result; 349 } 350 351 /** 352 * BOCU-1 encoder function. 
353 * 354 * @param pPrev pointer to the integer that holds 355 * the "previous code point" state; 356 * the initial value should be 0 which 357 * encodeBocu1 will set to the actual BOCU-1 initial state value 358 * @param c the code point to encode 359 * @return the packed 1/2/3/4-byte encoding, see packDiff(), 360 * or 0 if an error occurs 361 * 362 * @see packDiff 363 */ 364 U_CFUNC int32_t 365 encodeBocu1(int32_t *pPrev, int32_t c) { 366 int32_t prev; 367 368 if(pPrev==NULL || c<0 || c>0x10ffff) { 369 /* illegal argument */ 370 return 0; 371 } 372 373 prev=*pPrev; 374 if(prev==0) { 375 /* lenient handling of initial value 0 */ 376 prev=*pPrev=BOCU1_ASCII_PREV; 377 } 378 379 if(c<=0x20) { 380 /* 381 * ISO C0 control & space: 382 * Encode directly for MIME compatibility, 383 * and reset state except for space, to not disrupt compression. 384 */ 385 if(c!=0x20) { 386 *pPrev=BOCU1_ASCII_PREV; 387 } 388 return 0x01000000|c; 389 } 390 391 /* 392 * all other Unicode code points c==U+0021..U+10ffff 393 * are encoded with the difference c-prev 394 * 395 * a new prev is computed from c, 396 * placed in the middle of a 0x80-block (for most small scripts) or 397 * in the middle of the Unihan and Hangul blocks 398 * to statistically minimize the following difference 399 */ 400 *pPrev=bocu1Prev(c); 401 return packDiff(c-prev); 402 } 403 404 /** 405 * Function for BOCU-1 decoder; handles multi-byte lead bytes. 
406 * 407 * @param pRx pointer to the decoder state structure 408 * @param b lead byte; 409 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD 410 * @return -1 (state change only) 411 * 412 * @see decodeBocu1 413 */ 414 static int32_t 415 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) { 416 int32_t c, count; 417 418 if(b>=BOCU1_START_NEG_2) { 419 /* positive difference */ 420 if(b<BOCU1_START_POS_3) { 421 /* two bytes */ 422 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 423 count=1; 424 } else if(b<BOCU1_START_POS_4) { 425 /* three bytes */ 426 c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 427 count=2; 428 } else { 429 /* four bytes */ 430 c=BOCU1_REACH_POS_3+1; 431 count=3; 432 } 433 } else { 434 /* negative difference */ 435 if(b>=BOCU1_START_NEG_3) { 436 /* two bytes */ 437 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 438 count=1; 439 } else if(b>BOCU1_MIN) { 440 /* three bytes */ 441 c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 442 count=2; 443 } else { 444 /* four bytes */ 445 c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 446 count=3; 447 } 448 } 449 450 /* set the state for decoding the trail byte(s) */ 451 pRx->diff=c; 452 pRx->count=count; 453 return -1; 454 } 455 456 /** 457 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
458 * 459 * @param pRx pointer to the decoder state structure 460 * @param b trail byte 461 * @return result value, same as decodeBocu1 462 * 463 * @see decodeBocu1 464 */ 465 static int32_t 466 decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) { 467 int32_t t, c, count; 468 469 if(b<=0x20) { 470 /* skip some C0 controls and make the trail byte range contiguous */ 471 t=bocu1ByteToTrail[b]; 472 if(t<0) { 473 /* illegal trail byte value */ 474 pRx->prev=BOCU1_ASCII_PREV; 475 pRx->count=0; 476 return -99; 477 } 478 #if BOCU1_MAX_TRAIL<0xff 479 } else if(b>BOCU1_MAX_TRAIL) { 480 return -99; 481 #endif 482 } else { 483 t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET; 484 } 485 486 /* add trail byte into difference and decrement count */ 487 c=pRx->diff; 488 count=pRx->count; 489 490 if(count==1) { 491 /* final trail byte, deliver a code point */ 492 c=pRx->prev+c+t; 493 if(0<=c && c<=0x10ffff) { 494 /* valid code point result */ 495 pRx->prev=bocu1Prev(c); 496 pRx->count=0; 497 return c; 498 } else { 499 /* illegal code point result */ 500 pRx->prev=BOCU1_ASCII_PREV; 501 pRx->count=0; 502 return -99; 503 } 504 } 505 506 /* intermediate trail byte */ 507 if(count==2) { 508 pRx->diff=c+t*BOCU1_TRAIL_COUNT; 509 } else /* count==3 */ { 510 pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT; 511 } 512 pRx->count=count-1; 513 return -1; 514 } 515 516 /** 517 * BOCU-1 decoder function. 
518 * 519 * @param pRx pointer to the decoder state structure; 520 * the initial values should be 0 which 521 * decodeBocu1 will set to actual initial state values 522 * @param b an input byte 523 * @return 524 * 0..0x10ffff for a result code point 525 * -1 if only the state changed without code point output 526 * <-1 if an error occurs 527 */ 528 U_CFUNC int32_t 529 decodeBocu1(Bocu1Rx *pRx, uint8_t b) { 530 int32_t prev, c, count; 531 532 if(pRx==NULL) { 533 /* illegal argument */ 534 return -99; 535 } 536 537 prev=pRx->prev; 538 if(prev==0) { 539 /* lenient handling of initial 0 values */ 540 prev=pRx->prev=BOCU1_ASCII_PREV; 541 count=pRx->count=0; 542 } else { 543 count=pRx->count; 544 } 545 546 if(count==0) { 547 /* byte in lead position */ 548 if(b<=0x20) { 549 /* 550 * Direct-encoded C0 control code or space. 551 * Reset prev for C0 control codes but not for space. 552 */ 553 if(b!=0x20) { 554 pRx->prev=BOCU1_ASCII_PREV; 555 } 556 return b; 557 } 558 559 /* 560 * b is a difference lead byte. 561 * 562 * Return a code point directly from a single-byte difference. 563 * 564 * For multi-byte difference lead bytes, set the decoder state 565 * with the partial difference value from the lead byte and 566 * with the number of trail bytes. 567 * 568 * For four-byte differences, the signedness also affects the 569 * first trail byte, which has special handling farther below. 
570 */ 571 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) { 572 /* single-byte difference */ 573 c=prev+((int32_t)b-BOCU1_MIDDLE); 574 pRx->prev=bocu1Prev(c); 575 return c; 576 } else if(b==BOCU1_RESET) { 577 /* only reset the state, no code point */ 578 pRx->prev=BOCU1_ASCII_PREV; 579 return -1; 580 } else { 581 return decodeBocu1LeadByte(pRx, b); 582 } 583 } else { 584 /* trail byte in any position */ 585 return decodeBocu1TrailByte(pRx, b); 586 } 587 } 588 589 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */ 590 591 /* test code ---------------------------------------------------------------- */ 592 593 /* test code options */ 594 595 /* ignore comma when processing name lists in testText() */ 596 #define TEST_IGNORE_COMMA 1 597 598 /** 599 * Write a packed BOCU-1 byte sequence into a byte array, 600 * without overflow check. 601 * Test function. 602 * 603 * @param packed packed BOCU-1 byte sequence, see packDiff() 604 * @param p pointer to byte array 605 * @return number of bytes 606 * 607 * @see packDiff 608 */ 609 static int32_t 610 writePacked(int32_t packed, uint8_t *p) { 611 int32_t count=BOCU1_LENGTH_FROM_PACKED(packed); 612 switch(count) { 613 case 4: 614 *p++=(uint8_t)(packed>>24); 615 case 3: 616 *p++=(uint8_t)(packed>>16); 617 case 2: 618 *p++=(uint8_t)(packed>>8); 619 case 1: 620 *p++=(uint8_t)packed; 621 default: 622 break; 623 } 624 625 return count; 626 } 627 628 /** 629 * Unpack a packed BOCU-1 non-C0/space byte sequence and get 630 * the difference to initialPrev. 631 * Used only for round-trip testing of the difference encoding and decoding. 632 * Test function. 
633 * 634 * @param initialPrev bogus "previous code point" value to make sure that 635 * the resulting code point is in the range 0..0x10ffff 636 * @param packed packed BOCU-1 byte sequence 637 * @return the difference to initialPrev 638 * 639 * @see packDiff 640 * @see writeDiff 641 */ 642 static int32_t 643 unpackDiff(int32_t initialPrev, int32_t packed) { 644 Bocu1Rx rx={ 0, 0, 0 }; 645 int32_t count; 646 647 rx.prev=initialPrev; 648 count=BOCU1_LENGTH_FROM_PACKED(packed); 649 switch(count) { 650 case 4: 651 decodeBocu1(&rx, (uint8_t)(packed>>24)); 652 case 3: 653 decodeBocu1(&rx, (uint8_t)(packed>>16)); 654 case 2: 655 decodeBocu1(&rx, (uint8_t)(packed>>8)); 656 case 1: 657 /* subtract initial prev */ 658 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; 659 default: 660 return -0x7fffffff; 661 } 662 } 663 664 /** 665 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, 666 * preserving lexical order. 667 * Also checks for roundtripping of the difference encoding. 668 * Test function. 669 * 670 * @param diff difference value to test, -0x10ffff..0x10ffff 671 * @param p pointer to output byte array 672 * @return p advanced by number of bytes output 673 * 674 * @see unpackDiff 675 */ 676 static uint8_t * 677 writeDiff(int32_t diff, uint8_t *p) { 678 /* generate the difference as a packed value and serialize it */ 679 int32_t packed, initialPrev; 680 681 packed=packDiff(diff); 682 683 /* 684 * bogus initial "prev" to work around 685 * code point range check in decodeBocu1() 686 */ 687 if(diff<=0) { 688 initialPrev=0x10ffff; 689 } else { 690 initialPrev=-1; 691 } 692 693 if(diff!=unpackDiff(initialPrev, packed)) { 694 log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n", 695 diff, packed, unpackDiff(initialPrev, packed)); 696 } 697 return p+writePacked(packed, p); 698 } 699 700 /** 701 * Encode a UTF-16 string in BOCU-1. 702 * Does not check for overflows, but otherwise useful function. 
703 * 704 * @param s input UTF-16 string 705 * @param length number of UChar code units in s 706 * @param p pointer to output byte array 707 * @return number of bytes output 708 */ 709 static int32_t 710 writeString(const UChar *s, int32_t length, uint8_t *p) { 711 uint8_t *p0; 712 int32_t c, prev, i; 713 714 prev=0; 715 p0=p; 716 i=0; 717 while(i<length) { 718 UTF_NEXT_CHAR(s, i, length, c); 719 p+=writePacked(encodeBocu1(&prev, c), p); 720 } 721 return (int32_t)(p-p0); 722 } 723 724 /** 725 * Decode a BOCU-1 byte sequence to a UTF-16 string. 726 * Does not check for overflows, but otherwise useful function. 727 * 728 * @param p pointer to input BOCU-1 bytes 729 * @param length number of input bytes 730 * @param s point to output UTF-16 string array 731 * @return number of UChar code units output 732 */ 733 static int32_t 734 readString(const uint8_t *p, int32_t length, UChar *s) { 735 Bocu1Rx rx={ 0, 0, 0 }; 736 int32_t c, i, sLength; 737 738 i=sLength=0; 739 while(i<length) { 740 c=decodeBocu1(&rx, p[i++]); 741 if(c<-1) { 742 log_err("error: readString detects encoding error at string index %ld\n", i); 743 return -1; 744 } 745 if(c>=0) { 746 UTF_APPEND_CHAR_UNSAFE(s, sLength, c); 747 } 748 } 749 return sLength; 750 } 751 752 static U_INLINE char 753 hexDigit(uint8_t digit) { 754 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); 755 } 756 757 /** 758 * Pretty-print 0-terminated byte values. 759 * Helper function for test output. 760 * 761 * @param bytes 0-terminated byte array to print 762 */ 763 static void 764 printBytes(uint8_t *bytes, char *out) { 765 int i; 766 uint8_t b; 767 768 i=0; 769 while((b=*bytes++)!=0) { 770 *out++=' '; 771 *out++=hexDigit((uint8_t)(b>>4)); 772 *out++=hexDigit((uint8_t)(b&0xf)); 773 ++i; 774 } 775 i=3*(5-i); 776 while(i>0) { 777 *out++=' '; 778 --i; 779 } 780 *out=0; 781 } 782 783 /** 784 * Basic BOCU-1 test function, called when there are no command line arguments. 
785 * Prints some of the #define values and performs round-trip tests of the 786 * difference encoding and decoding. 787 */ 788 static void 789 TestBOCU1RefDiff(void) { 790 char buf1[80], buf2[80]; 791 uint8_t prev[5], level[5]; 792 int32_t i, cmp, countErrors; 793 794 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1); 795 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2); 796 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3); 797 798 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1); 799 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2); 800 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3); 801 802 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE); 803 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2); 804 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3); 805 806 /* test packDiff() & unpackDiff() with some specific values */ 807 writeDiff(0, level); 808 writeDiff(1, level); 809 writeDiff(65, level); 810 writeDiff(130, level); 811 writeDiff(30000, level); 812 writeDiff(1000000, level); 813 writeDiff(-65, level); 814 writeDiff(-130, level); 815 writeDiff(-30000, level); 816 writeDiff(-1000000, level); 817 818 /* test that each value is smaller than any following one */ 819 countErrors=0; 820 i=-0x10ffff; 821 *writeDiff(i, prev)=0; 822 823 /* show first number and bytes */ 824 printBytes(prev, buf1); 825 log_verbose(" wD(%8ld) %s\n", i, buf1); 826 827 for(++i; i<=0x10ffff; ++i) { 828 *writeDiff(i, level)=0; 829 cmp=strcmp((const char *)prev, (const char *)level); 830 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) { 831 
log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n", 832 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i); 833 } 834 if(cmp<0) { 835 if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) { 836 /* 837 * if the result is good, then print only if the length changed 838 * to get little but interesting output 839 */ 840 printBytes(prev, buf1); 841 printBytes(level, buf2); 842 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); 843 } 844 } else { 845 ++countErrors; 846 printBytes(prev, buf1); 847 printBytes(level, buf2); 848 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); 849 } 850 /* remember the previous bytes */ 851 memcpy(prev, level, 4); 852 } 853 854 /* show last number and bytes */ 855 printBytes((uint8_t *)"", buf1); 856 printBytes(prev, buf2); 857 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2); 858 859 if(countErrors==0) { 860 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n"); 861 } else { 862 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors); 863 } 864 865 /* output signature byte sequence */ 866 i=0; 867 writePacked(encodeBocu1(&i, 0xfeff), level); 868 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n", 869 level[0], level[1], level[2]); 870 } 871 872 /* cintltst code ------------------------------------------------------------ */ 873 874 static const int32_t DEFAULT_BUFFER_SIZE = 30000; 875 876 877 /* test one string with the ICU and the reference BOCU-1 implementations */ 878 static void 879 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) { 880 UChar *roundtripRef, *roundtripICU; 881 char *bocu1Ref, *bocu1ICU; 882 883 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength; 884 UErrorCode errorCode; 885 886 roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 887 roundtripICU = 
malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 888 bocu1Ref = malloc(DEFAULT_BUFFER_SIZE); 889 bocu1ICU = malloc(DEFAULT_BUFFER_SIZE); 890 891 /* Unicode -> BOCU-1 */ 892 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref); 893 894 errorCode=U_ZERO_ERROR; 895 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode); 896 if(U_FAILURE(errorCode)) { 897 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); 898 return; 899 } 900 901 if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) { 902 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength); 903 return; 904 } 905 906 /* BOCU-1 -> Unicode */ 907 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef); 908 if(roundtripRefLength<0) { 909 free(roundtripICU); 910 return; /* readString() found an error and reported it */ 911 } 912 913 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode); 914 if(U_FAILURE(errorCode)) { 915 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); 916 return; 917 } 918 919 if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) { 920 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength); 921 return; 922 } 923 if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) { 924 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength); 925 return; 926 } 927 free(roundtripRef); 928 free(roundtripICU); 929 free(bocu1Ref); 930 free(bocu1ICU); 931 } 932 933 static const UChar feff[]={ 0xfeff }; 934 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 }; 935 static const UChar crlf[]={ 0xd, 0xa, 0x20 }; 936 static const UChar 
nul[]={ 0 }; 937 static const UChar latin[]={ 0xdf, 0xe6 }; 938 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 }; 939 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 }; 940 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 }; 941 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 }; 942 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */ 943 static const UChar plane1[]={ 0xd800, 0xdc00 }; 944 static const UChar plane2[]={ 0xd845, 0xdddd }; 945 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 }; 946 static const UChar plane16[]={ 0xdbff, 0xdfff }; 947 static const UChar c0[]={ 1, 0xe40, 0x20, 9 }; 948 949 static const struct { 950 const UChar *s; 951 int32_t length; 952 } strings[]={ 953 { feff, LENGTHOF(feff) }, 954 { ascii, LENGTHOF(ascii) }, 955 { crlf, LENGTHOF(crlf) }, 956 { nul, LENGTHOF(nul) }, 957 { latin, LENGTHOF(latin) }, 958 { devanagari, LENGTHOF(devanagari) }, 959 { hiragana, LENGTHOF(hiragana) }, 960 { unihan, LENGTHOF(unihan) }, 961 { hangul, LENGTHOF(hangul) }, 962 { surrogates, LENGTHOF(surrogates) }, 963 { plane1, LENGTHOF(plane1) }, 964 { plane2, LENGTHOF(plane2) }, 965 { plane15, LENGTHOF(plane15) }, 966 { plane16, LENGTHOF(plane16) }, 967 { c0, LENGTHOF(c0) } 968 }; 969 970 /* 971 * Verify that the ICU BOCU-1 implementation produces the same results as 972 * the reference implementation from the design folder. 973 * Generate some texts and convert them with both converters, verifying 974 * identical results and roundtripping. 
975 */ 976 static void 977 TestBOCU1(void) { 978 UChar *text; 979 int32_t i, length; 980 981 UConverter *bocu1; 982 UErrorCode errorCode; 983 984 errorCode=U_ZERO_ERROR; 985 bocu1=ucnv_open("BOCU-1", &errorCode); 986 if(U_FAILURE(errorCode)) { 987 log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode)); 988 return; 989 } 990 991 text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 992 993 /* text 1: each of strings[] once */ 994 length=0; 995 for(i=0; i<LENGTHOF(strings); ++i) { 996 u_memcpy(text+length, strings[i].s, strings[i].length); 997 length+=strings[i].length; 998 } 999 roundtripBOCU1(bocu1, 1, text, length); 1000 1001 /* text 2: each of strings[] twice */ 1002 length=0; 1003 for(i=0; i<LENGTHOF(strings); ++i) { 1004 u_memcpy(text+length, strings[i].s, strings[i].length); 1005 length+=strings[i].length; 1006 u_memcpy(text+length, strings[i].s, strings[i].length); 1007 length+=strings[i].length; 1008 } 1009 roundtripBOCU1(bocu1, 2, text, length); 1010 1011 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */ 1012 length=0; 1013 for(i=1; length<5000; i+=7) { 1014 if(i>=LENGTHOF(strings)) { 1015 i-=LENGTHOF(strings); 1016 } 1017 u_memcpy(text+length, strings[i].s, strings[i].length); 1018 length+=strings[i].length; 1019 } 1020 roundtripBOCU1(bocu1, 3, text, length); 1021 1022 ucnv_close(bocu1); 1023 free(text); 1024 } 1025 1026 U_CFUNC void addBOCU1Tests(TestNode** root); 1027 1028 U_CFUNC void 1029 addBOCU1Tests(TestNode** root) { 1030 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff"); 1031 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1"); 1032 } 1033