/*
******************************************************************************
*
*   Copyright (C) 2002-2011, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
******************************************************************************
*   file name:  bocu1tst.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2002may27
*   created by: Markus W. Scherer
*
*   This is the reference implementation of BOCU-1,
*   the MIME-friendly form of the Binary Ordered Compression for Unicode,
*   taken directly from ### http://source.icu-project.org/repos/icu/icuhtml/trunk/design/conversion/bocu1/
*   The files bocu1.h and bocu1.c from the design folder are taken
*   verbatim (minus copyright and #include) and copied together into this file.
*   The reference code and some of the reference bocu1tst.c
*   is modified to run as part of the ICU cintltst
*   test framework (minus main(), log_ln() etc. instead of printf()).
*
*   This reference implementation is used here to verify
*   the ICU BOCU-1 implementation, which is
*   adapted for ICU conversion APIs and optimized.
*   ### links in design doc to here and to ucnvbocu.c
*/

#include "unicode/utypes.h"
#include "unicode/ustring.h"
#include "unicode/ucnv.h"
#include "unicode/utf16.h"
#include "cmemory.h"
#include "cintltst.h"

/* number of elements of a true array (not valid for pointers) */
#define LENGTHOF(array) (sizeof(array)/sizeof((array)[0]))

/* icuhtml/design/conversion/bocu1/bocu1.h ---------------------------------- */

/* BOCU-1 constants and macros ---------------------------------------------- */

/*
 * BOCU-1 encodes the code points of a Unicode string as
 * a sequence of byte-encoded differences (slope detection),
 * preserving lexical order.
 *
 * Optimize the difference-taking for runs of Unicode text within
 * small scripts:
 *
 * Most small scripts are allocated within aligned 128-blocks of Unicode
 * code points. Lexical order is preserved if the "previous code point" state
 * is always moved into the middle of such a block.
 *
 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul
 * areas into the middle of those areas.
 *
 * C0 control codes and space are encoded with their US-ASCII bytes.
 * "prev" is reset for C0 controls but not for space.
 */

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV        0x40

/* bounding byte values for differences */
#define BOCU1_MIN               0x21
#define BOCU1_MIDDLE            0x90
#define BOCU1_MAX_LEAD          0xfe

/* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */
#define BOCU1_MAX_TRAIL         0xffL
#define BOCU1_RESET             0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT             (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT  20
#define BOCU1_TRAIL_BYTE_OFFSET     (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of trail bytes ((0xff-0x21+1)+20 == 243);
 * the expression has type long because BOCU1_MAX_TRAIL carries the L suffix
 */
#define BOCU1_TRAIL_COUNT       ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE            64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2            43
#define BOCU1_LEAD_3            3
#define BOCU1_LEAD_4            1

/* The difference value range for single-byters (-64..63). */
#define BOCU1_REACH_POS_1   (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1   (-BOCU1_SINGLE)

/* The difference value range for double-byters (-10513..10512). */
#define BOCU1_REACH_POS_2   (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2   (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters (-187660..187659). */
#define BOCU1_REACH_POS_3   \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3   (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values (0xd0, 0xfb, 0xfe for the positive side). */
#define BOCU1_START_POS_2   (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3   (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4   (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* (0x50, 0x25, 0x22 for the negative side) */
#define BOCU1_START_NEG_2   (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3   (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4   (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/*
 * The length of a byte sequence, according to the lead byte (!=BOCU1_RESET).
 * The argument is evaluated more than once.
 */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed values below 0x04000000 carry the length in their top byte
 * (which is 0 for a packed value of 0); everything else is a 4-byte sequence
 * whose top byte is the lead byte itself.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)
158 */ 159 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 160 161 /* 162 * Byte value map for control codes, 163 * from external byte values 0x00..0x20 164 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 165 * External byte values that are illegal as trail bytes are mapped to -1. 166 */ 167 static const int8_t 168 bocu1ByteToTrail[BOCU1_MIN]={ 169 /* 0 1 2 3 4 5 6 7 */ 170 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 171 172 /* 8 9 a b c d e f */ 173 -1, -1, -1, -1, -1, -1, -1, -1, 174 175 /* 10 11 12 13 14 15 16 17 */ 176 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 177 178 /* 18 19 1a 1b 1c 1d 1e 1f */ 179 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 180 181 /* 20 */ 182 -1 183 }; 184 185 /* 186 * Byte value map for control codes, 187 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 188 * to external byte values 0x00..0x20. 189 */ 190 static const int8_t 191 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 192 /* 0 1 2 3 4 5 6 7 */ 193 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 194 195 /* 8 9 a b c d e f */ 196 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 197 198 /* 10 11 12 13 */ 199 0x1c, 0x1d, 0x1e, 0x1f 200 }; 201 202 /** 203 * Integer division and modulo with negative numerators 204 * yields negative modulo results and quotients that are one more than 205 * what we need here. 206 * This macro adjust the results so that the modulo-value m is always >=0. 207 * 208 * For positive n, the if() condition is always FALSE. 209 * 210 * @param n Number to be split into quotient and rest. 211 * Will be modified to contain the quotient. 212 * @param d Divisor. 213 * @param m Output variable for the rest (modulo result). 214 */ 215 #define NEGDIVMOD(n, d, m) { \ 216 (m)=(n)%(d); \ 217 (n)/=(d); \ 218 if((m)<0) { \ 219 --(n); \ 220 (m)+=(d); \ 221 } \ 222 } 223 224 /* State for BOCU-1 decoder function. 
*/ 225 struct Bocu1Rx { 226 int32_t prev, count, diff; 227 }; 228 229 typedef struct Bocu1Rx Bocu1Rx; 230 231 /* Function prototypes ------------------------------------------------------ */ 232 233 /* see bocu1.c */ 234 U_CFUNC int32_t 235 packDiff(int32_t diff); 236 237 U_CFUNC int32_t 238 encodeBocu1(int32_t *pPrev, int32_t c); 239 240 U_CFUNC int32_t 241 decodeBocu1(Bocu1Rx *pRx, uint8_t b); 242 243 /* icuhtml/design/conversion/bocu1/bocu1.c ---------------------------------- */ 244 245 /* BOCU-1 implementation functions ------------------------------------------ */ 246 247 /** 248 * Compute the next "previous" value for differencing 249 * from the current code point. 250 * 251 * @param c current code point, 0..0x10ffff 252 * @return "previous code point" state value 253 */ 254 static int32_t 255 bocu1Prev(int32_t c) { 256 /* compute new prev */ 257 if(0x3040<=c && c<=0x309f) { 258 /* Hiragana is not 128-aligned */ 259 return 0x3070; 260 } else if(0x4e00<=c && c<=0x9fa5) { 261 /* CJK Unihan */ 262 return 0x4e00-BOCU1_REACH_NEG_2; 263 } else if(0xac00<=c && c<=0xd7a3) { 264 /* Korean Hangul (cast to int32_t to avoid wraparound on 16-bit compilers) */ 265 return ((int32_t)0xd7a3+(int32_t)0xac00)/2; 266 } else { 267 /* mostly small scripts */ 268 return (c&~0x7f)+BOCU1_ASCII_PREV; 269 } 270 } 271 272 /** 273 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 274 * and return a packed integer with them. 275 * 276 * The encoding favors small absolut differences with short encodings 277 * to compress runs of same-script characters. 
278 * 279 * @param diff difference value -0x10ffff..0x10ffff 280 * @return 281 * 0x010000zz for 1-byte sequence zz 282 * 0x0200yyzz for 2-byte sequence yy zz 283 * 0x03xxyyzz for 3-byte sequence xx yy zz 284 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 285 */ 286 U_CFUNC int32_t 287 packDiff(int32_t diff) { 288 int32_t result, m, lead, count, shift; 289 290 if(diff>=BOCU1_REACH_NEG_1) { 291 /* mostly positive differences, and single-byte negative ones */ 292 if(diff<=BOCU1_REACH_POS_1) { 293 /* single byte */ 294 return 0x01000000|(BOCU1_MIDDLE+diff); 295 } else if(diff<=BOCU1_REACH_POS_2) { 296 /* two bytes */ 297 diff-=BOCU1_REACH_POS_1+1; 298 lead=BOCU1_START_POS_2; 299 count=1; 300 } else if(diff<=BOCU1_REACH_POS_3) { 301 /* three bytes */ 302 diff-=BOCU1_REACH_POS_2+1; 303 lead=BOCU1_START_POS_3; 304 count=2; 305 } else { 306 /* four bytes */ 307 diff-=BOCU1_REACH_POS_3+1; 308 lead=BOCU1_START_POS_4; 309 count=3; 310 } 311 } else { 312 /* two- and four-byte negative differences */ 313 if(diff>=BOCU1_REACH_NEG_2) { 314 /* two bytes */ 315 diff-=BOCU1_REACH_NEG_1; 316 lead=BOCU1_START_NEG_2; 317 count=1; 318 } else if(diff>=BOCU1_REACH_NEG_3) { 319 /* three bytes */ 320 diff-=BOCU1_REACH_NEG_2; 321 lead=BOCU1_START_NEG_3; 322 count=2; 323 } else { 324 /* four bytes */ 325 diff-=BOCU1_REACH_NEG_3; 326 lead=BOCU1_START_NEG_4; 327 count=3; 328 } 329 } 330 331 /* encode the length of the packed result */ 332 if(count<3) { 333 result=(count+1)<<24; 334 } else /* count==3, MSB used for the lead byte */ { 335 result=0; 336 } 337 338 /* calculate trail bytes like digits in itoa() */ 339 shift=0; 340 do { 341 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 342 result|=BOCU1_TRAIL_TO_BYTE(m)<<shift; 343 shift+=8; 344 } while(--count>0); 345 346 /* add lead byte */ 347 result|=(lead+diff)<<shift; 348 349 return result; 350 } 351 352 /** 353 * BOCU-1 encoder function. 
354 * 355 * @param pPrev pointer to the integer that holds 356 * the "previous code point" state; 357 * the initial value should be 0 which 358 * encodeBocu1 will set to the actual BOCU-1 initial state value 359 * @param c the code point to encode 360 * @return the packed 1/2/3/4-byte encoding, see packDiff(), 361 * or 0 if an error occurs 362 * 363 * @see packDiff 364 */ 365 U_CFUNC int32_t 366 encodeBocu1(int32_t *pPrev, int32_t c) { 367 int32_t prev; 368 369 if(pPrev==NULL || c<0 || c>0x10ffff) { 370 /* illegal argument */ 371 return 0; 372 } 373 374 prev=*pPrev; 375 if(prev==0) { 376 /* lenient handling of initial value 0 */ 377 prev=*pPrev=BOCU1_ASCII_PREV; 378 } 379 380 if(c<=0x20) { 381 /* 382 * ISO C0 control & space: 383 * Encode directly for MIME compatibility, 384 * and reset state except for space, to not disrupt compression. 385 */ 386 if(c!=0x20) { 387 *pPrev=BOCU1_ASCII_PREV; 388 } 389 return 0x01000000|c; 390 } 391 392 /* 393 * all other Unicode code points c==U+0021..U+10ffff 394 * are encoded with the difference c-prev 395 * 396 * a new prev is computed from c, 397 * placed in the middle of a 0x80-block (for most small scripts) or 398 * in the middle of the Unihan and Hangul blocks 399 * to statistically minimize the following difference 400 */ 401 *pPrev=bocu1Prev(c); 402 return packDiff(c-prev); 403 } 404 405 /** 406 * Function for BOCU-1 decoder; handles multi-byte lead bytes. 
407 * 408 * @param pRx pointer to the decoder state structure 409 * @param b lead byte; 410 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<=BOCU1_MAX_LEAD 411 * @return -1 (state change only) 412 * 413 * @see decodeBocu1 414 */ 415 static int32_t 416 decodeBocu1LeadByte(Bocu1Rx *pRx, uint8_t b) { 417 int32_t c, count; 418 419 if(b>=BOCU1_START_NEG_2) { 420 /* positive difference */ 421 if(b<BOCU1_START_POS_3) { 422 /* two bytes */ 423 c=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 424 count=1; 425 } else if(b<BOCU1_START_POS_4) { 426 /* three bytes */ 427 c=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 428 count=2; 429 } else { 430 /* four bytes */ 431 c=BOCU1_REACH_POS_3+1; 432 count=3; 433 } 434 } else { 435 /* negative difference */ 436 if(b>=BOCU1_START_NEG_3) { 437 /* two bytes */ 438 c=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 439 count=1; 440 } else if(b>BOCU1_MIN) { 441 /* three bytes */ 442 c=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 443 count=2; 444 } else { 445 /* four bytes */ 446 c=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 447 count=3; 448 } 449 } 450 451 /* set the state for decoding the trail byte(s) */ 452 pRx->diff=c; 453 pRx->count=count; 454 return -1; 455 } 456 457 /** 458 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
/**
 * Function for BOCU-1 decoder; handles multi-byte trail bytes.
 *
 * Maps the external byte value to a trail value, then either completes the
 * code point (when this is the last expected trail byte) or adds the scaled
 * trail value to the partial difference and waits for more bytes.
 * On the error paths that are compiled in, the decoder state is reset
 * (prev=BOCU1_ASCII_PREV, count=0) before -99 is returned.
 *
 * @param pRx pointer to the decoder state structure
 * @param b trail byte
 * @return result value, same as decodeBocu1
 *
 * @see decodeBocu1
 */
static int32_t
decodeBocu1TrailByte(Bocu1Rx *pRx, uint8_t b) {
    int32_t t, c, count;

    if(b<=0x20) {
        /* skip some C0 controls and make the trail byte range contiguous */
        t=bocu1ByteToTrail[b];
        if(t<0) {
            /* illegal trail byte value */
            pRx->prev=BOCU1_ASCII_PREV;
            pRx->count=0;
            return -99;
        }
#if BOCU1_MAX_TRAIL<0xff
    /* only compiled if the trail byte range is configured to end below 0xff */
    } else if(b>BOCU1_MAX_TRAIL) {
        return -99;
#endif
    } else {
        t=(int32_t)b-BOCU1_TRAIL_BYTE_OFFSET;
    }

    /* add trail byte into difference and decrement count */
    c=pRx->diff;
    count=pRx->count;

    if(count==1) {
        /* final trail byte, deliver a code point */
        c=pRx->prev+c+t;
        if(0<=c && c<=0x10ffff) {
            /* valid code point result */
            pRx->prev=bocu1Prev(c);
            pRx->count=0;
            return c;
        } else {
            /* illegal code point result */
            pRx->prev=BOCU1_ASCII_PREV;
            pRx->count=0;
            return -99;
        }
    }

    /* intermediate trail byte: scale by its position, like a digit */
    if(count==2) {
        pRx->diff=c+t*BOCU1_TRAIL_COUNT;
    } else /* count==3 */ {
        pRx->diff=c+t*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT;
    }
    pRx->count=count-1;
    return -1;
}
519 * 520 * @param pRx pointer to the decoder state structure; 521 * the initial values should be 0 which 522 * decodeBocu1 will set to actual initial state values 523 * @param b an input byte 524 * @return 525 * 0..0x10ffff for a result code point 526 * -1 if only the state changed without code point output 527 * <-1 if an error occurs 528 */ 529 U_CFUNC int32_t 530 decodeBocu1(Bocu1Rx *pRx, uint8_t b) { 531 int32_t prev, c, count; 532 533 if(pRx==NULL) { 534 /* illegal argument */ 535 return -99; 536 } 537 538 prev=pRx->prev; 539 if(prev==0) { 540 /* lenient handling of initial 0 values */ 541 prev=pRx->prev=BOCU1_ASCII_PREV; 542 count=pRx->count=0; 543 } else { 544 count=pRx->count; 545 } 546 547 if(count==0) { 548 /* byte in lead position */ 549 if(b<=0x20) { 550 /* 551 * Direct-encoded C0 control code or space. 552 * Reset prev for C0 control codes but not for space. 553 */ 554 if(b!=0x20) { 555 pRx->prev=BOCU1_ASCII_PREV; 556 } 557 return b; 558 } 559 560 /* 561 * b is a difference lead byte. 562 * 563 * Return a code point directly from a single-byte difference. 564 * 565 * For multi-byte difference lead bytes, set the decoder state 566 * with the partial difference value from the lead byte and 567 * with the number of trail bytes. 568 * 569 * For four-byte differences, the signedness also affects the 570 * first trail byte, which has special handling farther below. 
571 */ 572 if(b>=BOCU1_START_NEG_2 && b<BOCU1_START_POS_2) { 573 /* single-byte difference */ 574 c=prev+((int32_t)b-BOCU1_MIDDLE); 575 pRx->prev=bocu1Prev(c); 576 return c; 577 } else if(b==BOCU1_RESET) { 578 /* only reset the state, no code point */ 579 pRx->prev=BOCU1_ASCII_PREV; 580 return -1; 581 } else { 582 return decodeBocu1LeadByte(pRx, b); 583 } 584 } else { 585 /* trail byte in any position */ 586 return decodeBocu1TrailByte(pRx, b); 587 } 588 } 589 590 /* icuhtml/design/conversion/bocu1/bocu1tst.c ------------------------------- */ 591 592 /* test code ---------------------------------------------------------------- */ 593 594 /* test code options */ 595 596 /* ignore comma when processing name lists in testText() */ 597 #define TEST_IGNORE_COMMA 1 598 599 /** 600 * Write a packed BOCU-1 byte sequence into a byte array, 601 * without overflow check. 602 * Test function. 603 * 604 * @param packed packed BOCU-1 byte sequence, see packDiff() 605 * @param p pointer to byte array 606 * @return number of bytes 607 * 608 * @see packDiff 609 */ 610 static int32_t 611 writePacked(int32_t packed, uint8_t *p) { 612 int32_t count=BOCU1_LENGTH_FROM_PACKED(packed); 613 switch(count) { 614 case 4: 615 *p++=(uint8_t)(packed>>24); 616 case 3: 617 *p++=(uint8_t)(packed>>16); 618 case 2: 619 *p++=(uint8_t)(packed>>8); 620 case 1: 621 *p++=(uint8_t)packed; 622 default: 623 break; 624 } 625 626 return count; 627 } 628 629 /** 630 * Unpack a packed BOCU-1 non-C0/space byte sequence and get 631 * the difference to initialPrev. 632 * Used only for round-trip testing of the difference encoding and decoding. 633 * Test function. 
634 * 635 * @param initialPrev bogus "previous code point" value to make sure that 636 * the resulting code point is in the range 0..0x10ffff 637 * @param packed packed BOCU-1 byte sequence 638 * @return the difference to initialPrev 639 * 640 * @see packDiff 641 * @see writeDiff 642 */ 643 static int32_t 644 unpackDiff(int32_t initialPrev, int32_t packed) { 645 Bocu1Rx rx={ 0, 0, 0 }; 646 int32_t count; 647 648 rx.prev=initialPrev; 649 count=BOCU1_LENGTH_FROM_PACKED(packed); 650 switch(count) { 651 case 4: 652 decodeBocu1(&rx, (uint8_t)(packed>>24)); 653 case 3: 654 decodeBocu1(&rx, (uint8_t)(packed>>16)); 655 case 2: 656 decodeBocu1(&rx, (uint8_t)(packed>>8)); 657 case 1: 658 /* subtract initial prev */ 659 return decodeBocu1(&rx, (uint8_t)packed)-initialPrev; 660 default: 661 return -0x7fffffff; 662 } 663 } 664 665 /** 666 * Encode one difference value -0x10ffff..+0x10ffff in 1..4 bytes, 667 * preserving lexical order. 668 * Also checks for roundtripping of the difference encoding. 669 * Test function. 670 * 671 * @param diff difference value to test, -0x10ffff..0x10ffff 672 * @param p pointer to output byte array 673 * @return p advanced by number of bytes output 674 * 675 * @see unpackDiff 676 */ 677 static uint8_t * 678 writeDiff(int32_t diff, uint8_t *p) { 679 /* generate the difference as a packed value and serialize it */ 680 int32_t packed, initialPrev; 681 682 packed=packDiff(diff); 683 684 /* 685 * bogus initial "prev" to work around 686 * code point range check in decodeBocu1() 687 */ 688 if(diff<=0) { 689 initialPrev=0x10ffff; 690 } else { 691 initialPrev=-1; 692 } 693 694 if(diff!=unpackDiff(initialPrev, packed)) { 695 log_err("error: unpackDiff(packDiff(diff=%ld)=0x%08lx)=%ld!=diff\n", 696 diff, packed, unpackDiff(initialPrev, packed)); 697 } 698 return p+writePacked(packed, p); 699 } 700 701 /** 702 * Encode a UTF-16 string in BOCU-1. 703 * Does not check for overflows, but otherwise useful function. 
704 * 705 * @param s input UTF-16 string 706 * @param length number of UChar code units in s 707 * @param p pointer to output byte array 708 * @return number of bytes output 709 */ 710 static int32_t 711 writeString(const UChar *s, int32_t length, uint8_t *p) { 712 uint8_t *p0; 713 int32_t c, prev, i; 714 715 prev=0; 716 p0=p; 717 i=0; 718 while(i<length) { 719 U16_NEXT(s, i, length, c); 720 p+=writePacked(encodeBocu1(&prev, c), p); 721 } 722 return (int32_t)(p-p0); 723 } 724 725 /** 726 * Decode a BOCU-1 byte sequence to a UTF-16 string. 727 * Does not check for overflows, but otherwise useful function. 728 * 729 * @param p pointer to input BOCU-1 bytes 730 * @param length number of input bytes 731 * @param s point to output UTF-16 string array 732 * @return number of UChar code units output 733 */ 734 static int32_t 735 readString(const uint8_t *p, int32_t length, UChar *s) { 736 Bocu1Rx rx={ 0, 0, 0 }; 737 int32_t c, i, sLength; 738 739 i=sLength=0; 740 while(i<length) { 741 c=decodeBocu1(&rx, p[i++]); 742 if(c<-1) { 743 log_err("error: readString detects encoding error at string index %ld\n", i); 744 return -1; 745 } 746 if(c>=0) { 747 U16_APPEND_UNSAFE(s, sLength, c); 748 } 749 } 750 return sLength; 751 } 752 753 static char 754 hexDigit(uint8_t digit) { 755 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); 756 } 757 758 /** 759 * Pretty-print 0-terminated byte values. 760 * Helper function for test output. 761 * 762 * @param bytes 0-terminated byte array to print 763 */ 764 static void 765 printBytes(uint8_t *bytes, char *out) { 766 int i; 767 uint8_t b; 768 769 i=0; 770 while((b=*bytes++)!=0) { 771 *out++=' '; 772 *out++=hexDigit((uint8_t)(b>>4)); 773 *out++=hexDigit((uint8_t)(b&0xf)); 774 ++i; 775 } 776 i=3*(5-i); 777 while(i>0) { 778 *out++=' '; 779 --i; 780 } 781 *out=0; 782 } 783 784 /** 785 * Basic BOCU-1 test function, called when there are no command line arguments. 
786 * Prints some of the #define values and performs round-trip tests of the 787 * difference encoding and decoding. 788 */ 789 static void 790 TestBOCU1RefDiff(void) { 791 char buf1[80], buf2[80]; 792 uint8_t prev[5], level[5]; 793 int32_t i, cmp, countErrors; 794 795 log_verbose("reach of single bytes: %ld\n", 1+BOCU1_REACH_POS_1-BOCU1_REACH_NEG_1); 796 log_verbose("reach of 2 bytes : %ld\n", 1+BOCU1_REACH_POS_2-BOCU1_REACH_NEG_2); 797 log_verbose("reach of 3 bytes : %ld\n\n", 1+BOCU1_REACH_POS_3-BOCU1_REACH_NEG_3); 798 799 log_verbose(" BOCU1_REACH_NEG_1 %8ld BOCU1_REACH_POS_1 %8ld\n", BOCU1_REACH_NEG_1, BOCU1_REACH_POS_1); 800 log_verbose(" BOCU1_REACH_NEG_2 %8ld BOCU1_REACH_POS_2 %8ld\n", BOCU1_REACH_NEG_2, BOCU1_REACH_POS_2); 801 log_verbose(" BOCU1_REACH_NEG_3 %8ld BOCU1_REACH_POS_3 %8ld\n\n", BOCU1_REACH_NEG_3, BOCU1_REACH_POS_3); 802 803 log_verbose(" BOCU1_MIDDLE 0x%02x\n", BOCU1_MIDDLE); 804 log_verbose(" BOCU1_START_NEG_2 0x%02x BOCU1_START_POS_2 0x%02x\n", BOCU1_START_NEG_2, BOCU1_START_POS_2); 805 log_verbose(" BOCU1_START_NEG_3 0x%02x BOCU1_START_POS_3 0x%02x\n\n", BOCU1_START_NEG_3, BOCU1_START_POS_3); 806 807 /* test packDiff() & unpackDiff() with some specific values */ 808 writeDiff(0, level); 809 writeDiff(1, level); 810 writeDiff(65, level); 811 writeDiff(130, level); 812 writeDiff(30000, level); 813 writeDiff(1000000, level); 814 writeDiff(-65, level); 815 writeDiff(-130, level); 816 writeDiff(-30000, level); 817 writeDiff(-1000000, level); 818 819 /* test that each value is smaller than any following one */ 820 countErrors=0; 821 i=-0x10ffff; 822 *writeDiff(i, prev)=0; 823 824 /* show first number and bytes */ 825 printBytes(prev, buf1); 826 log_verbose(" wD(%8ld) %s\n", i, buf1); 827 828 for(++i; i<=0x10ffff; ++i) { 829 *writeDiff(i, level)=0; 830 cmp=strcmp((const char *)prev, (const char *)level); 831 if(BOCU1_LENGTH_FROM_LEAD(level[0])!=(int32_t)strlen((const char *)level)) { 832 
log_verbose("BOCU1_LENGTH_FROM_LEAD(0x%02x)=%ld!=%ld=strlen(writeDiff(%ld))\n", 833 level[0], BOCU1_LENGTH_FROM_LEAD(level[0]), strlen((const char *)level), i); 834 } 835 if(cmp<0) { 836 if(i==0 || i==1 || strlen((const char *)prev)!=strlen((const char *)level)) { 837 /* 838 * if the result is good, then print only if the length changed 839 * to get little but interesting output 840 */ 841 printBytes(prev, buf1); 842 printBytes(level, buf2); 843 log_verbose("ok: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); 844 } 845 } else { 846 ++countErrors; 847 printBytes(prev, buf1); 848 printBytes(level, buf2); 849 log_verbose("wrong: strcmp(wD(%8ld), wD(%8ld))=%2d %s%s\n", i-1, i, cmp, buf1, buf2); 850 } 851 /* remember the previous bytes */ 852 memcpy(prev, level, 4); 853 } 854 855 /* show last number and bytes */ 856 printBytes((uint8_t *)"", buf1); 857 printBytes(prev, buf2); 858 log_verbose(" wD(%8ld) %s%s\n", i-1, buf1, buf2); 859 860 if(countErrors==0) { 861 log_verbose("writeDiff(-0x10ffff..0x10ffff) works fine\n"); 862 } else { 863 log_err("writeDiff(-0x10ffff..0x10ffff) violates lexical ordering in %d cases\n", countErrors); 864 } 865 866 /* output signature byte sequence */ 867 i=0; 868 writePacked(encodeBocu1(&i, 0xfeff), level); 869 log_verbose("\nBOCU-1 signature byte sequence: %02x %02x %02x\n", 870 level[0], level[1], level[2]); 871 } 872 873 /* cintltst code ------------------------------------------------------------ */ 874 875 static const int32_t DEFAULT_BUFFER_SIZE = 30000; 876 877 878 /* test one string with the ICU and the reference BOCU-1 implementations */ 879 static void 880 roundtripBOCU1(UConverter *bocu1, int32_t number, const UChar *text, int32_t length) { 881 UChar *roundtripRef, *roundtripICU; 882 char *bocu1Ref, *bocu1ICU; 883 884 int32_t bocu1RefLength, bocu1ICULength, roundtripRefLength, roundtripICULength; 885 UErrorCode errorCode; 886 887 roundtripRef = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 888 roundtripICU = 
malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 889 bocu1Ref = malloc(DEFAULT_BUFFER_SIZE); 890 bocu1ICU = malloc(DEFAULT_BUFFER_SIZE); 891 892 /* Unicode -> BOCU-1 */ 893 bocu1RefLength=writeString(text, length, (uint8_t *)bocu1Ref); 894 895 errorCode=U_ZERO_ERROR; 896 bocu1ICULength=ucnv_fromUChars(bocu1, bocu1ICU, DEFAULT_BUFFER_SIZE, text, length, &errorCode); 897 if(U_FAILURE(errorCode)) { 898 log_err("ucnv_fromUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); 899 goto cleanup; 900 } 901 902 if(bocu1RefLength!=bocu1ICULength || 0!=uprv_memcmp(bocu1Ref, bocu1ICU, bocu1RefLength)) { 903 log_err("Unicode(%d)[%d] -> BOCU-1: reference[%d]!=ICU[%d]\n", number, length, bocu1RefLength, bocu1ICULength); 904 goto cleanup; 905 } 906 907 /* BOCU-1 -> Unicode */ 908 roundtripRefLength=readString((uint8_t *)bocu1Ref, bocu1RefLength, roundtripRef); 909 if(roundtripRefLength<0) { 910 goto cleanup; /* readString() found an error and reported it */ 911 } 912 913 roundtripICULength=ucnv_toUChars(bocu1, roundtripICU, DEFAULT_BUFFER_SIZE, bocu1ICU, bocu1ICULength, &errorCode); 914 if(U_FAILURE(errorCode)) { 915 log_err("ucnv_toUChars(BOCU-1, text(%d)[%d]) failed: %s\n", number, length, u_errorName(errorCode)); 916 goto cleanup; 917 } 918 919 if(length!=roundtripRefLength || 0!=u_memcmp(text, roundtripRef, length)) { 920 log_err("BOCU-1 -> Unicode: original(%d)[%d]!=reference[%d]\n", number, length, roundtripRefLength); 921 goto cleanup; 922 } 923 if(roundtripRefLength!=roundtripICULength || 0!=u_memcmp(roundtripRef, roundtripICU, roundtripRefLength)) { 924 log_err("BOCU-1 -> Unicode: reference(%d)[%d]!=ICU[%d]\n", number, roundtripRefLength, roundtripICULength); 925 goto cleanup; 926 } 927 cleanup: 928 free(roundtripRef); 929 free(roundtripICU); 930 free(bocu1Ref); 931 free(bocu1ICU); 932 } 933 934 static const UChar feff[]={ 0xfeff }; 935 static const UChar ascii[]={ 0x61, 0x62, 0x20, 0x63, 0x61 }; 936 static const UChar crlf[]={ 0xd, 0xa, 0x20 }; 
937 static const UChar nul[]={ 0 }; 938 static const UChar latin[]={ 0xdf, 0xe6 }; 939 static const UChar devanagari[]={ 0x930, 0x20, 0x918, 0x909 }; 940 static const UChar hiragana[]={ 0x3086, 0x304d, 0x20, 0x3053, 0x4000 }; 941 static const UChar unihan[]={ 0x4e00, 0x7777, 0x20, 0x9fa5, 0x4e00 }; 942 static const UChar hangul[]={ 0xac00, 0xbcde, 0x20, 0xd7a3 }; 943 static const UChar surrogates[]={ 0xdc00, 0xd800 }; /* single surrogates, unmatched! */ 944 static const UChar plane1[]={ 0xd800, 0xdc00 }; 945 static const UChar plane2[]={ 0xd845, 0xdddd }; 946 static const UChar plane15[]={ 0xdbbb, 0xddee, 0x20 }; 947 static const UChar plane16[]={ 0xdbff, 0xdfff }; 948 static const UChar c0[]={ 1, 0xe40, 0x20, 9 }; 949 950 static const struct { 951 const UChar *s; 952 int32_t length; 953 } strings[]={ 954 { feff, LENGTHOF(feff) }, 955 { ascii, LENGTHOF(ascii) }, 956 { crlf, LENGTHOF(crlf) }, 957 { nul, LENGTHOF(nul) }, 958 { latin, LENGTHOF(latin) }, 959 { devanagari, LENGTHOF(devanagari) }, 960 { hiragana, LENGTHOF(hiragana) }, 961 { unihan, LENGTHOF(unihan) }, 962 { hangul, LENGTHOF(hangul) }, 963 { surrogates, LENGTHOF(surrogates) }, 964 { plane1, LENGTHOF(plane1) }, 965 { plane2, LENGTHOF(plane2) }, 966 { plane15, LENGTHOF(plane15) }, 967 { plane16, LENGTHOF(plane16) }, 968 { c0, LENGTHOF(c0) } 969 }; 970 971 /* 972 * Verify that the ICU BOCU-1 implementation produces the same results as 973 * the reference implementation from the design folder. 974 * Generate some texts and convert them with both converters, verifying 975 * identical results and roundtripping. 
976 */ 977 static void 978 TestBOCU1(void) { 979 UChar *text; 980 int32_t i, length; 981 982 UConverter *bocu1; 983 UErrorCode errorCode; 984 985 errorCode=U_ZERO_ERROR; 986 bocu1=ucnv_open("BOCU-1", &errorCode); 987 if(U_FAILURE(errorCode)) { 988 log_err("error: unable to open BOCU-1 converter: %s\n", u_errorName(errorCode)); 989 return; 990 } 991 992 text = malloc(DEFAULT_BUFFER_SIZE * sizeof(UChar)); 993 994 /* text 1: each of strings[] once */ 995 length=0; 996 for(i=0; i<LENGTHOF(strings); ++i) { 997 u_memcpy(text+length, strings[i].s, strings[i].length); 998 length+=strings[i].length; 999 } 1000 roundtripBOCU1(bocu1, 1, text, length); 1001 1002 /* text 2: each of strings[] twice */ 1003 length=0; 1004 for(i=0; i<LENGTHOF(strings); ++i) { 1005 u_memcpy(text+length, strings[i].s, strings[i].length); 1006 length+=strings[i].length; 1007 u_memcpy(text+length, strings[i].s, strings[i].length); 1008 length+=strings[i].length; 1009 } 1010 roundtripBOCU1(bocu1, 2, text, length); 1011 1012 /* text 3: each of strings[] many times (set step vs. |strings| so that all strings are used) */ 1013 length=0; 1014 for(i=1; length<5000; i+=7) { 1015 if(i>=LENGTHOF(strings)) { 1016 i-=LENGTHOF(strings); 1017 } 1018 u_memcpy(text+length, strings[i].s, strings[i].length); 1019 length+=strings[i].length; 1020 } 1021 roundtripBOCU1(bocu1, 3, text, length); 1022 1023 ucnv_close(bocu1); 1024 free(text); 1025 } 1026 1027 U_CFUNC void addBOCU1Tests(TestNode** root); 1028 1029 U_CFUNC void 1030 addBOCU1Tests(TestNode** root) { 1031 addTest(root, TestBOCU1RefDiff, "tsconv/bocu1tst/TestBOCU1RefDiff"); 1032 addTest(root, TestBOCU1, "tsconv/bocu1tst/TestBOCU1"); 1033 } 1034