1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2002-2005, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvbocu.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar27 14 * created by: Markus W. Scherer 15 * 16 * This is an implementation of the Binary Ordered Compression for Unicode, 17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION 23 24 #include "unicode/ucnv.h" 25 #include "unicode/ucnv_cb.h" 26 #include "ucnv_bld.h" 27 #include "ucnv_cnv.h" 28 29 /* BOCU-1 constants and macros ---------------------------------------------- */ 30 31 /* 32 * BOCU-1 encodes the code points of a Unicode string as 33 * a sequence of byte-encoded differences (slope detection), 34 * preserving lexical order. 35 * 36 * Optimize the difference-taking for runs of Unicode text within 37 * small scripts: 38 * 39 * Most small scripts are allocated within aligned 128-blocks of Unicode 40 * code points. Lexical order is preserved if the "previous code point" state 41 * is always moved into the middle of such a block. 42 * 43 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 44 * areas into the middle of those areas. 45 * 46 * C0 control codes and space are encoded with their US-ASCII bytes. 47 * "prev" is reset for C0 controls but not for space. 
*/

/* initial value for "prev": middle of the ASCII range */
#define BOCU1_ASCII_PREV 0x40

/*
 * bounding byte values for differences
 *
 * BOCU1_MIN..BOCU1_MAX_LEAD are the possible lead (and single) bytes,
 * BOCU1_MIDDLE is the single byte that encodes a difference of 0.
 * BOCU1_RESET has the same value as BOCU1_MAX_TRAIL: 0xff is a legal trail byte
 * but never a lead byte (BOCU1_MAX_LEAD is 0xfe).
 */
#define BOCU1_MIN 0x21
#define BOCU1_MIDDLE 0x90
#define BOCU1_MAX_LEAD 0xfe
#define BOCU1_MAX_TRAIL 0xff
#define BOCU1_RESET 0xff

/* number of lead bytes (0xfe-0x21+1 == 222) */
#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1)

/* adjust trail byte counts for the use of some C0 control byte values */
#define BOCU1_TRAIL_CONTROLS_COUNT 20
/* offset between a trail value >=20 and its byte value (0x21-20 == 13) */
#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT)

/* number of trail bytes ((0xff-0x21+1)+20 == 243) */
#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT)

/*
 * number of positive and negative single-byte codes
 * (counting 0==BOCU1_MIDDLE among the positive ones)
 */
#define BOCU1_SINGLE 64

/* number of lead bytes for positive and negative 2/3/4-byte sequences */
#define BOCU1_LEAD_2 43
#define BOCU1_LEAD_3 3
#define BOCU1_LEAD_4 1

/* The difference value range for single-byters: -64..63. */
#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1)
#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE)

/* The difference value range for double-byters: -10513..10512. */
#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)
#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT)

/* The difference value range for 3-byters: -187660..187659. */
#define BOCU1_REACH_POS_3 \
    (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT)

/* The lead byte start values: 0xd0, 0xfb, 0xfe for positive differences. */
#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1)
#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2)
#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3)
     /* ==BOCU1_MAX_LEAD */

/* ... and 0x50, 0x25, 0x22 for negative differences. */
#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1)
#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2)
#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3)
     /* ==BOCU1_MIN+1 */

/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */
#define BOCU1_LENGTH_FROM_LEAD(lead) \
    ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \
     (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \
     (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4)

/*
 * The length of a byte sequence, according to its packed form.
 * Packed 1..3-byte sequences carry their length in the top byte;
 * anything with a top byte of 0x04 or more is a 4-byte sequence
 * whose top byte is the lead byte itself.
 */
#define BOCU1_LENGTH_FROM_PACKED(packed) \
    ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4)

/*
 * 12 commonly used C0 control codes (and space) are only used to encode
 * themselves directly,
 * which makes BOCU-1 MIME-usable and reasonably safe for
 * ASCII-oriented software.
 *
 * These controls are
 *  0   NUL
 *
 *  7   BEL
 *  8   BS
 *
 *  9   TAB
 *  a   LF
 *  b   VT
 *  c   FF
 *  d   CR
 *
 *  e   SO
 *  f   SI
 *
 * 1a   SUB
 * 1b   ESC
 *
 * The other 20 C0 controls are also encoded directly (to preserve order)
 * but are also used as trail bytes in difference encoding
 * (for better compression).
 *
 * BOCU1_TRAIL_TO_BYTE maps a trail value 0..BOCU1_TRAIL_COUNT-1 to its byte:
 * values below BOCU1_TRAIL_CONTROLS_COUNT go through the bocu1TrailToByte table,
 * all others are shifted by BOCU1_TRAIL_BYTE_OFFSET.
 * The argument is evaluated more than once.
 */
#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t])

/*
 * Byte value map for control codes,
 * from external byte values 0x00..0x20
 * to trail byte values 0..19 (0..0x13) as used in the difference calculation.
 * External byte values that are illegal as trail bytes are mapped to -1.
151 */ 152 static const int8_t 153 bocu1ByteToTrail[BOCU1_MIN]={ 154 /* 0 1 2 3 4 5 6 7 */ 155 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 156 157 /* 8 9 a b c d e f */ 158 -1, -1, -1, -1, -1, -1, -1, -1, 159 160 /* 10 11 12 13 14 15 16 17 */ 161 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 162 163 /* 18 19 1a 1b 1c 1d 1e 1f */ 164 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 165 166 /* 20 */ 167 -1 168 }; 169 170 /* 171 * Byte value map for control codes, 172 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 173 * to external byte values 0x00..0x20. 174 */ 175 static const int8_t 176 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 177 /* 0 1 2 3 4 5 6 7 */ 178 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 179 180 /* 8 9 a b c d e f */ 181 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 182 183 /* 10 11 12 13 */ 184 0x1c, 0x1d, 0x1e, 0x1f 185 }; 186 187 /** 188 * Integer division and modulo with negative numerators 189 * yields negative modulo results and quotients that are one more than 190 * what we need here. 191 * This macro adjust the results so that the modulo-value m is always >=0. 192 * 193 * For positive n, the if() condition is always FALSE. 194 * 195 * @param n Number to be split into quotient and rest. 196 * Will be modified to contain the quotient. 197 * @param d Divisor. 198 * @param m Output variable for the rest (modulo result). 199 */ 200 #define NEGDIVMOD(n, d, m) { \ 201 (m)=(n)%(d); \ 202 (n)/=(d); \ 203 if((m)<0) { \ 204 --(n); \ 205 (m)+=(d); \ 206 } \ 207 } 208 209 /* BOCU-1 implementation functions ------------------------------------------ */ 210 211 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 212 213 /** 214 * Compute the next "previous" value for differencing 215 * from the current code point. 
216 * 217 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 218 * @return "previous code point" state value 219 */ 220 static U_INLINE int32_t 221 bocu1Prev(int32_t c) { 222 /* compute new prev */ 223 if(/* 0x3040<=c && */ c<=0x309f) { 224 /* Hiragana is not 128-aligned */ 225 return 0x3070; 226 } else if(0x4e00<=c && c<=0x9fa5) { 227 /* CJK Unihan */ 228 return 0x4e00-BOCU1_REACH_NEG_2; 229 } else if(0xac00<=c /* && c<=0xd7a3 */) { 230 /* Korean Hangul */ 231 return (0xd7a3+0xac00)/2; 232 } else { 233 /* mostly small scripts */ 234 return BOCU1_SIMPLE_PREV(c); 235 } 236 } 237 238 /** Fast version of bocu1Prev() for most scripts. */ 239 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 240 241 /* 242 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 243 * The UConverter fields are used as follows: 244 * 245 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 246 * 247 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 248 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 249 */ 250 251 /* BOCU-1-from-Unicode conversion functions --------------------------------- */ 252 253 /** 254 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 255 * and return a packed integer with them. 256 * 257 * The encoding favors small absolut differences with short encodings 258 * to compress runs of same-script characters. 259 * 260 * Optimized version with unrolled loops and fewer floating-point operations 261 * than the standard packDiff(). 
262 * 263 * @param diff difference value -0x10ffff..0x10ffff 264 * @return 265 * 0x010000zz for 1-byte sequence zz 266 * 0x0200yyzz for 2-byte sequence yy zz 267 * 0x03xxyyzz for 3-byte sequence xx yy zz 268 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 269 */ 270 static int32_t 271 packDiff(int32_t diff) { 272 int32_t result, m; 273 274 if(diff>=BOCU1_REACH_NEG_1) { 275 /* mostly positive differences, and single-byte negative ones */ 276 #if 0 /* single-byte case handled in macros, see below */ 277 if(diff<=BOCU1_REACH_POS_1) { 278 /* single byte */ 279 return 0x01000000|(BOCU1_MIDDLE+diff); 280 } else 281 #endif 282 if(diff<=BOCU1_REACH_POS_2) { 283 /* two bytes */ 284 diff-=BOCU1_REACH_POS_1+1; 285 result=0x02000000; 286 287 m=diff%BOCU1_TRAIL_COUNT; 288 diff/=BOCU1_TRAIL_COUNT; 289 result|=BOCU1_TRAIL_TO_BYTE(m); 290 291 result|=(BOCU1_START_POS_2+diff)<<8; 292 } else if(diff<=BOCU1_REACH_POS_3) { 293 /* three bytes */ 294 diff-=BOCU1_REACH_POS_2+1; 295 result=0x03000000; 296 297 m=diff%BOCU1_TRAIL_COUNT; 298 diff/=BOCU1_TRAIL_COUNT; 299 result|=BOCU1_TRAIL_TO_BYTE(m); 300 301 m=diff%BOCU1_TRAIL_COUNT; 302 diff/=BOCU1_TRAIL_COUNT; 303 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 304 305 result|=(BOCU1_START_POS_3+diff)<<16; 306 } else { 307 /* four bytes */ 308 diff-=BOCU1_REACH_POS_3+1; 309 310 m=diff%BOCU1_TRAIL_COUNT; 311 diff/=BOCU1_TRAIL_COUNT; 312 result=BOCU1_TRAIL_TO_BYTE(m); 313 314 m=diff%BOCU1_TRAIL_COUNT; 315 diff/=BOCU1_TRAIL_COUNT; 316 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 317 318 /* 319 * We know that / and % would deliver quotient 0 and rest=diff. 320 * Avoid division and modulo for performance. 
321 */ 322 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; 323 324 result|=((uint32_t)BOCU1_START_POS_4)<<24; 325 } 326 } else { 327 /* two- to four-byte negative differences */ 328 if(diff>=BOCU1_REACH_NEG_2) { 329 /* two bytes */ 330 diff-=BOCU1_REACH_NEG_1; 331 result=0x02000000; 332 333 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 334 result|=BOCU1_TRAIL_TO_BYTE(m); 335 336 result|=(BOCU1_START_NEG_2+diff)<<8; 337 } else if(diff>=BOCU1_REACH_NEG_3) { 338 /* three bytes */ 339 diff-=BOCU1_REACH_NEG_2; 340 result=0x03000000; 341 342 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 343 result|=BOCU1_TRAIL_TO_BYTE(m); 344 345 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 346 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 347 348 result|=(BOCU1_START_NEG_3+diff)<<16; 349 } else { 350 /* four bytes */ 351 diff-=BOCU1_REACH_NEG_3; 352 353 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 354 result=BOCU1_TRAIL_TO_BYTE(m); 355 356 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 357 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 358 359 /* 360 * We know that NEGDIVMOD would deliver 361 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. 362 * Avoid division and modulo for performance. 363 */ 364 m=diff+BOCU1_TRAIL_COUNT; 365 result|=BOCU1_TRAIL_TO_BYTE(m)<<16; 366 367 result|=BOCU1_MIN<<24; 368 } 369 } 370 return result; 371 } 372 373 /* Faster versions of packDiff() for single-byte-encoded diff values. */ 374 375 /** Is a diff value encodable in a single byte? */ 376 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) 377 378 /** Encode a diff value in a single byte. */ 379 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) 380 381 /** Is a diff value encodable in two bytes? 
*/
#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2)

/**
 * Encode UTF-16 code units from pArgs->source as BOCU-1 bytes into
 * pArgs->target, and write one source index into pArgs->offsets
 * for every byte that goes to the target.
 *
 * Converter state used across calls:
 * - cnv->fromUChar32: a lead surrogate that ended the previous source buffer
 *   (0 if none). While such a surrogate is pending inside this function,
 *   c holds its negated value.
 * - cnv->fromUnicodeStatus: the BOCU-1 "prev" value; 0 is read as BOCU1_ASCII_PREV.
 *
 * When a multi-byte sequence does not fit, the bytes that fit are written to
 * the target, the rest goes to cnv->charErrorBuffer, and *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR. The same error is set when the target is full
 * and there is more input.
 *
 * offsets is written through unconditionally; presumably this function is
 * only selected when pArgs->offsets is not NULL - confirm against the caller.
 *
 * NOTE(review): if the function is entered with a pending lead surrogate
 * (cnv->fromUChar32!=0) and no room in the target, c stays positive, so the
 * final "c<0 ? -c : 0" stores 0 and the surrogate looks like it is dropped.
 * Confirm whether the caller can reach this function in that state.
 *
 * @param pArgs      converter, source/target pointers and limits, offsets;
 *                   source, target and offsets are updated on return
 * @param pErrorCode receives U_BUFFER_OVERFLOW_ERROR on target overflow
 */
static void
_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs,
                             UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;
    int32_t *offsets;

    int32_t prev, c, diff;

    int32_t sourceIndex, nextSourceIndex;

U_ALIGN_CODE(16)

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);
    offsets=pArgs->offsets;

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex= c==0 ? 0 : -1;
    nextSourceIndex=0;

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        /* continue with the lead surrogate from the previous buffer */
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    /* one code unit in, one byte out; stops at the first c>=0x3000 or multi-byte diff */
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
            *offsets++=nextSourceIndex++;
            ++source;
            --targetCapacity;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=nextSourceIndex++;
                ++source;
                --targetCapacity;
            } else {
                break;
            }
        }
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);
    sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;
            ++nextSourceIndex;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                *offsets++=sourceIndex;
                --targetCapacity;

                sourceIndex=nextSourceIndex;
                continue;
            }

            if(UTF_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(UTF_IS_SECOND_SURROGATE(trail)) {
                        ++source;
                        ++nextSourceIndex;
                        c=UTF16_GET_PAIR_VALUE(c, trail);
                    }
                    /* otherwise the unpaired lead surrogate is encoded as it is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                *offsets++=sourceIndex;
                --targetCapacity;
                sourceIndex=nextSourceIndex;
                if(c<0x3000) {
                    /* back to the fast loop as long as it can handle the text */
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                /* diff is now the lead byte, m the trail value */
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                *offsets++=sourceIndex;
                *offsets++=sourceIndex;
                targetCapacity-=2;
                sourceIndex=nextSourceIndex;
            } else {
                int32_t length; /* will be 2..4 */

                /* from here on, diff holds the packed byte sequence */
                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                        *offsets++=sourceIndex;
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                    sourceIndex=nextSourceIndex;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                        *offsets++=sourceIndex;
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                        *offsets++=sourceIndex;
                    case 1:
                        *target++=(uint8_t)diff;
                        *offsets++=sourceIndex;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    /* c<0 only if the input ended after a lead surrogate (see getTrail) */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
    pArgs->offsets=offsets;
}

/*
 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling.
 * If a change is made in the original function, then either
 * change this function the same way or
 * re-copy the original function and remove the variables
 * offsets, sourceIndex, and nextSourceIndex.
*/
/**
 * Encode UTF-16 code units from pArgs->source as BOCU-1 bytes into
 * pArgs->target. No offsets are written.
 *
 * Converter state used across calls:
 * - cnv->fromUChar32: a lead surrogate that ended the previous source buffer
 *   (0 if none). While such a surrogate is pending inside this function,
 *   c holds its negated value.
 * - cnv->fromUnicodeStatus: the BOCU-1 "prev" value; 0 is read as BOCU1_ASCII_PREV.
 *
 * When a multi-byte sequence does not fit, the bytes that fit are written to
 * the target, the rest goes to cnv->charErrorBuffer, and *pErrorCode is set to
 * U_BUFFER_OVERFLOW_ERROR. The same error is set when the target is full
 * and there is more input.
 *
 * NOTE(review): if the function is entered with a pending lead surrogate
 * (cnv->fromUChar32!=0) and no room in the target, c stays positive, so the
 * final "c<0 ? -c : 0" stores 0 and the surrogate looks like it is dropped.
 * Confirm whether the caller can reach this function in that state.
 *
 * @param pArgs      converter, source/target pointers and limits;
 *                   source and target are updated on return
 * @param pErrorCode receives U_BUFFER_OVERFLOW_ERROR on target overflow
 */
static void
_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs,
                  UErrorCode *pErrorCode) {
    UConverter *cnv;
    const UChar *source, *sourceLimit;
    uint8_t *target;
    int32_t targetCapacity;

    int32_t prev, c, diff;

    /* set up the local pointers */
    cnv=pArgs->converter;
    source=pArgs->source;
    sourceLimit=pArgs->sourceLimit;
    target=(uint8_t *)pArgs->target;
    targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target);

    /* get the converter state from UConverter */
    c=cnv->fromUChar32;
    prev=(int32_t)cnv->fromUnicodeStatus;
    if(prev==0) {
        prev=BOCU1_ASCII_PREV;
    }

    /* conversion loop */
    if(c!=0 && targetCapacity>0) {
        /* continue with the lead surrogate from the previous buffer */
        goto getTrail;
    }

fastSingle:
    /* fast loop for single-byte differences */
    /* use only one loop counter variable, targetCapacity, not also source */
    diff=(int32_t)(sourceLimit-source);
    if(targetCapacity>diff) {
        targetCapacity=diff;
    }
    /* one code unit in, one byte out; stops at the first c>=0x3000 or multi-byte diff */
    while(targetCapacity>0 && (c=*source)<0x3000) {
        if(c<=0x20) {
            if(c!=0x20) {
                prev=BOCU1_ASCII_PREV;
            }
            *target++=(uint8_t)c;
        } else {
            diff=c-prev;
            if(DIFF_IS_SINGLE(diff)) {
                prev=BOCU1_SIMPLE_PREV(c);
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
            } else {
                break;
            }
        }
        ++source;
        --targetCapacity;
    }
    /* restore real values */
    targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target);

    /* regular loop for all cases */
    while(source<sourceLimit) {
        if(targetCapacity>0) {
            c=*source++;

            if(c<=0x20) {
                /*
                 * ISO C0 control & space:
                 * Encode directly for MIME compatibility,
                 * and reset state except for space, to not disrupt compression.
                 */
                if(c!=0x20) {
                    prev=BOCU1_ASCII_PREV;
                }
                *target++=(uint8_t)c;
                --targetCapacity;
                continue;
            }

            if(UTF_IS_LEAD(c)) {
getTrail:
                if(source<sourceLimit) {
                    /* test the following code unit */
                    UChar trail=*source;
                    if(UTF_IS_SECOND_SURROGATE(trail)) {
                        ++source;
                        c=UTF16_GET_PAIR_VALUE(c, trail);
                    }
                    /* otherwise the unpaired lead surrogate is encoded as it is */
                } else {
                    /* no more input */
                    c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */
                    break;
                }
            }

            /*
             * all other Unicode code points c==U+0021..U+10ffff
             * are encoded with the difference c-prev
             *
             * a new prev is computed from c,
             * placed in the middle of a 0x80-block (for most small scripts) or
             * in the middle of the Unihan and Hangul blocks
             * to statistically minimize the following difference
             */
            diff=c-prev;
            prev=BOCU1_PREV(c);
            if(DIFF_IS_SINGLE(diff)) {
                *target++=(uint8_t)PACK_SINGLE_DIFF(diff);
                --targetCapacity;
                if(c<0x3000) {
                    /* back to the fast loop as long as it can handle the text */
                    goto fastSingle;
                }
            } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) {
                /* optimize 2-byte case */
                int32_t m;

                if(diff>=0) {
                    diff-=BOCU1_REACH_POS_1+1;
                    m=diff%BOCU1_TRAIL_COUNT;
                    diff/=BOCU1_TRAIL_COUNT;
                    diff+=BOCU1_START_POS_2;
                } else {
                    diff-=BOCU1_REACH_NEG_1;
                    NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m);
                    diff+=BOCU1_START_NEG_2;
                }
                /* diff is now the lead byte, m the trail value */
                *target++=(uint8_t)diff;
                *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m);
                targetCapacity-=2;
            } else {
                int32_t length; /* will be 2..4 */

                /* from here on, diff holds the packed byte sequence */
                diff=packDiff(diff);
                length=BOCU1_LENGTH_FROM_PACKED(diff);

                /* write the output character bytes from diff and length */
                /* from the first if in the loop we know that targetCapacity>0 */
                if(length<=targetCapacity) {
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 4:
                        *target++=(uint8_t)(diff>>24);
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                    /* case 2: handled above */
                        *target++=(uint8_t)(diff>>8);
                    /* case 1: handled above */
                        *target++=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    targetCapacity-=length;
                } else {
                    uint8_t *charErrorBuffer;

                    /*
                     * We actually do this backwards here:
                     * In order to save an intermediate variable, we output
                     * first to the overflow buffer what does not fit into the
                     * regular target.
                     */
                    /* we know that 1<=targetCapacity<length<=4 */
                    length-=targetCapacity;
                    charErrorBuffer=(uint8_t *)cnv->charErrorBuffer;
                    switch(length) {
                        /* each branch falls through to the next one */
                    case 3:
                        *charErrorBuffer++=(uint8_t)(diff>>16);
                    case 2:
                        *charErrorBuffer++=(uint8_t)(diff>>8);
                    case 1:
                        *charErrorBuffer=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }
                    cnv->charErrorBufferLength=(int8_t)length;

                    /* now output what fits into the regular target */
                    diff>>=8*length; /* length was reduced by targetCapacity */
                    switch(targetCapacity) {
                        /* each branch falls through to the next one */
                    case 3:
                        *target++=(uint8_t)(diff>>16);
                    case 2:
                        *target++=(uint8_t)(diff>>8);
                    case 1:
                        *target++=(uint8_t)diff;
                    default:
                        /* will never occur */
                        break;
                    }

                    /* target overflow */
                    targetCapacity=0;
                    *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
                    break;
                }
            }
        } else {
            /* target is full */
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            break;
        }
    }

    /* set the converter state back into UConverter */
    /* c<0 only if the input ended after a lead surrogate (see getTrail) */
    cnv->fromUChar32= c<0 ? -c : 0;
    cnv->fromUnicodeStatus=(uint32_t)prev;

    /* write back the updated pointers */
    pArgs->source=source;
    pArgs->target=(char *)target;
}

/* BOCU-1-to-Unicode conversion functions ----------------------------------- */

/**
 * Function for BOCU-1 decoder; handles multi-byte lead bytes.
855 * 856 * @param b lead byte; 857 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 858 * @return (diff<<2)|count 859 */ 860 static U_INLINE int32_t 861 decodeBocu1LeadByte(int32_t b) { 862 int32_t diff, count; 863 864 if(b>=BOCU1_START_NEG_2) { 865 /* positive difference */ 866 if(b<BOCU1_START_POS_3) { 867 /* two bytes */ 868 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 869 count=1; 870 } else if(b<BOCU1_START_POS_4) { 871 /* three bytes */ 872 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 873 count=2; 874 } else { 875 /* four bytes */ 876 diff=BOCU1_REACH_POS_3+1; 877 count=3; 878 } 879 } else { 880 /* negative difference */ 881 if(b>=BOCU1_START_NEG_3) { 882 /* two bytes */ 883 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 884 count=1; 885 } else if(b>BOCU1_MIN) { 886 /* three bytes */ 887 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 888 count=2; 889 } else { 890 /* four bytes */ 891 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 892 count=3; 893 } 894 } 895 896 /* return the state for decoding the trail byte(s) */ 897 return (diff<<2)|count; 898 } 899 900 /** 901 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 
902 * 903 * @param count number of remaining trail bytes including this one 904 * @param b trail byte 905 * @return new delta for diff including b - <0 indicates an error 906 * 907 * @see decodeBocu1 908 */ 909 static U_INLINE int32_t 910 decodeBocu1TrailByte(int32_t count, int32_t b) { 911 if(b<=0x20) { 912 /* skip some C0 controls and make the trail byte range contiguous */ 913 b=bocu1ByteToTrail[b]; 914 /* b<0 for an illegal trail byte value will result in return<0 below */ 915 #if BOCU1_MAX_TRAIL<0xff 916 } else if(b>BOCU1_MAX_TRAIL) { 917 return -99; 918 #endif 919 } else { 920 b-=BOCU1_TRAIL_BYTE_OFFSET; 921 } 922 923 /* add trail byte into difference and decrement count */ 924 if(count==1) { 925 return b; 926 } else if(count==2) { 927 return b*BOCU1_TRAIL_COUNT; 928 } else /* count==3 */ { 929 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); 930 } 931 } 932 933 static void 934 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 935 UErrorCode *pErrorCode) { 936 UConverter *cnv; 937 const uint8_t *source, *sourceLimit; 938 UChar *target; 939 const UChar *targetLimit; 940 int32_t *offsets; 941 942 int32_t prev, count, diff, c; 943 944 int8_t byteIndex; 945 uint8_t *bytes; 946 947 int32_t sourceIndex, nextSourceIndex; 948 949 /* set up the local pointers */ 950 cnv=pArgs->converter; 951 source=(const uint8_t *)pArgs->source; 952 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 953 target=pArgs->target; 954 targetLimit=pArgs->targetLimit; 955 offsets=pArgs->offsets; 956 957 /* get the converter state from UConverter */ 958 prev=(int32_t)cnv->toUnicodeStatus; 959 if(prev==0) { 960 prev=BOCU1_ASCII_PREV; 961 } 962 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 963 count=diff&3; 964 diff>>=2; 965 966 byteIndex=cnv->toULength; 967 bytes=cnv->toUBytes; 968 969 /* sourceIndex=-1 if the current character began in the previous buffer */ 970 sourceIndex=byteIndex==0 ? 
0 : -1; 971 nextSourceIndex=0; 972 973 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 974 if(count>0 && byteIndex>0 && target<targetLimit) { 975 goto getTrail; 976 } 977 978 fastSingle: 979 /* fast loop for single-byte differences */ 980 /* use count as the only loop counter variable */ 981 diff=(int32_t)(sourceLimit-source); 982 count=(int32_t)(pArgs->targetLimit-target); 983 if(count>diff) { 984 count=diff; 985 } 986 while(count>0) { 987 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 988 c=prev+(c-BOCU1_MIDDLE); 989 if(c<0x3000) { 990 *target++=(UChar)c; 991 *offsets++=nextSourceIndex++; 992 prev=BOCU1_SIMPLE_PREV(c); 993 } else { 994 break; 995 } 996 } else if(c<=0x20) { 997 if(c!=0x20) { 998 prev=BOCU1_ASCII_PREV; 999 } 1000 *target++=(UChar)c; 1001 *offsets++=nextSourceIndex++; 1002 } else { 1003 break; 1004 } 1005 ++source; 1006 --count; 1007 } 1008 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1009 1010 /* decode a sequence of single and lead bytes */ 1011 while(source<sourceLimit) { 1012 if(target>=targetLimit) { 1013 /* target is full */ 1014 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1015 break; 1016 } 1017 1018 ++nextSourceIndex; 1019 c=*source++; 1020 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1021 /* Write a code point directly from a single-byte difference. */ 1022 c=prev+(c-BOCU1_MIDDLE); 1023 if(c<0x3000) { 1024 *target++=(UChar)c; 1025 *offsets++=sourceIndex; 1026 prev=BOCU1_SIMPLE_PREV(c); 1027 sourceIndex=nextSourceIndex; 1028 goto fastSingle; 1029 } 1030 } else if(c<=0x20) { 1031 /* 1032 * Direct-encoded C0 control code or space. 1033 * Reset prev for C0 control codes but not for space. 1034 */ 1035 if(c!=0x20) { 1036 prev=BOCU1_ASCII_PREV; 1037 } 1038 *target++=(UChar)c; 1039 *offsets++=sourceIndex; 1040 sourceIndex=nextSourceIndex; 1041 continue; 1042 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1043 /* Optimize two-byte case. 
*/ 1044 if(c>=BOCU1_MIDDLE) { 1045 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1046 } else { 1047 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1048 } 1049 1050 /* trail byte */ 1051 ++nextSourceIndex; 1052 c=decodeBocu1TrailByte(1, *source++); 1053 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1054 bytes[0]=source[-2]; 1055 bytes[1]=source[-1]; 1056 byteIndex=2; 1057 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1058 break; 1059 } 1060 } else if(c==BOCU1_RESET) { 1061 /* only reset the state, no code point */ 1062 prev=BOCU1_ASCII_PREV; 1063 sourceIndex=nextSourceIndex; 1064 continue; 1065 } else { 1066 /* 1067 * For multi-byte difference lead bytes, set the decoder state 1068 * with the partial difference value from the lead byte and 1069 * with the number of trail bytes. 1070 */ 1071 bytes[0]=(uint8_t)c; 1072 byteIndex=1; 1073 1074 diff=decodeBocu1LeadByte(c); 1075 count=diff&3; 1076 diff>>=2; 1077 getTrail: 1078 for(;;) { 1079 if(source>=sourceLimit) { 1080 goto endloop; 1081 } 1082 ++nextSourceIndex; 1083 c=bytes[byteIndex++]=*source++; 1084 1085 /* trail byte in any position */ 1086 c=decodeBocu1TrailByte(count, c); 1087 if(c<0) { 1088 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1089 goto endloop; 1090 } 1091 1092 diff+=c; 1093 if(--count==0) { 1094 /* final trail byte, deliver a code point */ 1095 byteIndex=0; 1096 c=prev+diff; 1097 if((uint32_t)c>0x10ffff) { 1098 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1099 goto endloop; 1100 } 1101 break; 1102 } 1103 } 1104 } 1105 1106 /* calculate the next prev and output c */ 1107 prev=BOCU1_PREV(c); 1108 if(c<=0xffff) { 1109 *target++=(UChar)c; 1110 *offsets++=sourceIndex; 1111 } else { 1112 /* output surrogate pair */ 1113 *target++=UTF16_LEAD(c); 1114 if(target<targetLimit) { 1115 *target++=UTF16_TRAIL(c); 1116 *offsets++=sourceIndex; 1117 *offsets++=sourceIndex; 1118 } else { 1119 /* target overflow */ 1120 *offsets++=sourceIndex; 1121 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c); 1122 
cnv->UCharErrorBufferLength=1; 1123 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1124 break; 1125 } 1126 } 1127 sourceIndex=nextSourceIndex; 1128 } 1129 endloop: 1130 1131 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1132 /* set the converter state in UConverter to deal with the next character */ 1133 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1134 cnv->mode=0; 1135 } else { 1136 /* set the converter state back into UConverter */ 1137 cnv->toUnicodeStatus=(uint32_t)prev; 1138 cnv->mode=(diff<<2)|count; 1139 } 1140 cnv->toULength=byteIndex; 1141 1142 /* write back the updated pointers */ 1143 pArgs->source=(const char *)source; 1144 pArgs->target=target; 1145 pArgs->offsets=offsets; 1146 return; 1147 } 1148 1149 /* 1150 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. 1151 * If a change is made in the original function, then either 1152 * change this function the same way or 1153 * re-copy the original function and remove the variables 1154 * offsets, sourceIndex, and nextSourceIndex. 
1155 */ 1156 static void 1157 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1158 UErrorCode *pErrorCode) { 1159 UConverter *cnv; 1160 const uint8_t *source, *sourceLimit; 1161 UChar *target; 1162 const UChar *targetLimit; 1163 1164 int32_t prev, count, diff, c; 1165 1166 int8_t byteIndex; 1167 uint8_t *bytes; 1168 1169 U_ALIGN_CODE(16) 1170 1171 /* set up the local pointers */ 1172 cnv=pArgs->converter; 1173 source=(const uint8_t *)pArgs->source; 1174 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1175 target=pArgs->target; 1176 targetLimit=pArgs->targetLimit; 1177 1178 /* get the converter state from UConverter */ 1179 prev=(int32_t)cnv->toUnicodeStatus; 1180 if(prev==0) { 1181 prev=BOCU1_ASCII_PREV; 1182 } 1183 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1184 count=diff&3; 1185 diff>>=2; 1186 1187 byteIndex=cnv->toULength; 1188 bytes=cnv->toUBytes; 1189 1190 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1191 if(count>0 && byteIndex>0 && target<targetLimit) { 1192 goto getTrail; 1193 } 1194 1195 fastSingle: 1196 /* fast loop for single-byte differences */ 1197 /* use count as the only loop counter variable */ 1198 diff=(int32_t)(sourceLimit-source); 1199 count=(int32_t)(pArgs->targetLimit-target); 1200 if(count>diff) { 1201 count=diff; 1202 } 1203 while(count>0) { 1204 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1205 c=prev+(c-BOCU1_MIDDLE); 1206 if(c<0x3000) { 1207 *target++=(UChar)c; 1208 prev=BOCU1_SIMPLE_PREV(c); 1209 } else { 1210 break; 1211 } 1212 } else if(c<=0x20) { 1213 if(c!=0x20) { 1214 prev=BOCU1_ASCII_PREV; 1215 } 1216 *target++=(UChar)c; 1217 } else { 1218 break; 1219 } 1220 ++source; 1221 --count; 1222 } 1223 1224 /* decode a sequence of single and lead bytes */ 1225 while(source<sourceLimit) { 1226 if(target>=targetLimit) { 1227 /* target is full */ 1228 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1229 break; 1230 } 1231 1232 c=*source++; 1233 if(BOCU1_START_NEG_2<=c && 
c<BOCU1_START_POS_2) { 1234 /* Write a code point directly from a single-byte difference. */ 1235 c=prev+(c-BOCU1_MIDDLE); 1236 if(c<0x3000) { 1237 *target++=(UChar)c; 1238 prev=BOCU1_SIMPLE_PREV(c); 1239 goto fastSingle; 1240 } 1241 } else if(c<=0x20) { 1242 /* 1243 * Direct-encoded C0 control code or space. 1244 * Reset prev for C0 control codes but not for space. 1245 */ 1246 if(c!=0x20) { 1247 prev=BOCU1_ASCII_PREV; 1248 } 1249 *target++=(UChar)c; 1250 continue; 1251 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1252 /* Optimize two-byte case. */ 1253 if(c>=BOCU1_MIDDLE) { 1254 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1255 } else { 1256 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1257 } 1258 1259 /* trail byte */ 1260 c=decodeBocu1TrailByte(1, *source++); 1261 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1262 bytes[0]=source[-2]; 1263 bytes[1]=source[-1]; 1264 byteIndex=2; 1265 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1266 break; 1267 } 1268 } else if(c==BOCU1_RESET) { 1269 /* only reset the state, no code point */ 1270 prev=BOCU1_ASCII_PREV; 1271 continue; 1272 } else { 1273 /* 1274 * For multi-byte difference lead bytes, set the decoder state 1275 * with the partial difference value from the lead byte and 1276 * with the number of trail bytes. 
1277 */ 1278 bytes[0]=(uint8_t)c; 1279 byteIndex=1; 1280 1281 diff=decodeBocu1LeadByte(c); 1282 count=diff&3; 1283 diff>>=2; 1284 getTrail: 1285 for(;;) { 1286 if(source>=sourceLimit) { 1287 goto endloop; 1288 } 1289 c=bytes[byteIndex++]=*source++; 1290 1291 /* trail byte in any position */ 1292 c=decodeBocu1TrailByte(count, c); 1293 if(c<0) { 1294 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1295 goto endloop; 1296 } 1297 1298 diff+=c; 1299 if(--count==0) { 1300 /* final trail byte, deliver a code point */ 1301 byteIndex=0; 1302 c=prev+diff; 1303 if((uint32_t)c>0x10ffff) { 1304 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1305 goto endloop; 1306 } 1307 break; 1308 } 1309 } 1310 } 1311 1312 /* calculate the next prev and output c */ 1313 prev=BOCU1_PREV(c); 1314 if(c<=0xffff) { 1315 *target++=(UChar)c; 1316 } else { 1317 /* output surrogate pair */ 1318 *target++=UTF16_LEAD(c); 1319 if(target<targetLimit) { 1320 *target++=UTF16_TRAIL(c); 1321 } else { 1322 /* target overflow */ 1323 cnv->UCharErrorBuffer[0]=UTF16_TRAIL(c); 1324 cnv->UCharErrorBufferLength=1; 1325 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1326 break; 1327 } 1328 } 1329 } 1330 endloop: 1331 1332 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1333 /* set the converter state in UConverter to deal with the next character */ 1334 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1335 cnv->mode=0; 1336 } else { 1337 /* set the converter state back into UConverter */ 1338 cnv->toUnicodeStatus=(uint32_t)prev; 1339 cnv->mode=(diff<<2)|count; 1340 } 1341 cnv->toULength=byteIndex; 1342 1343 /* write back the updated pointers */ 1344 pArgs->source=(const char *)source; 1345 pArgs->target=target; 1346 return; 1347 } 1348 1349 /* miscellaneous ------------------------------------------------------------ */ 1350 1351 static const UConverterImpl _Bocu1Impl={ 1352 UCNV_BOCU1, 1353 1354 NULL, 1355 NULL, 1356 1357 NULL, 1358 NULL, 1359 NULL, 1360 1361 _Bocu1ToUnicode, 1362 _Bocu1ToUnicodeWithOffsets, 1363 _Bocu1FromUnicode, 1364 _Bocu1FromUnicodeWithOffsets, 
1365 NULL, 1366 1367 NULL, 1368 NULL, 1369 NULL, 1370 NULL, 1371 ucnv_getCompleteUnicodeSet 1372 }; 1373 1374 static const UConverterStaticData _Bocu1StaticData={ 1375 sizeof(UConverterStaticData), 1376 "BOCU-1", 1377 1214, /* CCSID for BOCU-1 */ 1378 UCNV_IBM, UCNV_BOCU1, 1379 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ 1380 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1381 FALSE, FALSE, 1382 0, 1383 0, 1384 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1385 }; 1386 1387 const UConverterSharedData _Bocu1Data={ 1388 sizeof(UConverterSharedData), ~((uint32_t)0), 1389 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, 1390 0 1391 }; 1392 1393 #endif 1394