1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2002-2011, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvbocu.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar27 14 * created by: Markus W. Scherer 15 * 16 * This is an implementation of the Binary Ordered Compression for Unicode, 17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION 23 24 #include "unicode/ucnv.h" 25 #include "unicode/ucnv_cb.h" 26 #include "unicode/utf16.h" 27 #include "putilimp.h" 28 #include "ucnv_bld.h" 29 #include "ucnv_cnv.h" 30 #include "uassert.h" 31 32 /* BOCU-1 constants and macros ---------------------------------------------- */ 33 34 /* 35 * BOCU-1 encodes the code points of a Unicode string as 36 * a sequence of byte-encoded differences (slope detection), 37 * preserving lexical order. 38 * 39 * Optimize the difference-taking for runs of Unicode text within 40 * small scripts: 41 * 42 * Most small scripts are allocated within aligned 128-blocks of Unicode 43 * code points. Lexical order is preserved if the "previous code point" state 44 * is always moved into the middle of such a block. 45 * 46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 47 * areas into the middle of those areas. 48 * 49 * C0 control codes and space are encoded with their US-ASCII bytes. 50 * "prev" is reset for C0 controls but not for space. 51 */ 52 53 /* initial value for "prev": middle of the ASCII range */ 54 #define BOCU1_ASCII_PREV 0x40 55 56 /* bounding byte values for differences */ 57 #define BOCU1_MIN 0x21 58 #define BOCU1_MIDDLE 0x90 59 #define BOCU1_MAX_LEAD 0xfe 60 #define BOCU1_MAX_TRAIL 0xff 61 #define BOCU1_RESET 0xff 62 63 /* number of lead bytes */ 64 #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) 65 66 /* adjust trail byte counts for the use of some C0 control byte values */ 67 #define BOCU1_TRAIL_CONTROLS_COUNT 20 68 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) 69 70 /* number of trail bytes */ 71 #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) 72 73 /* 74 * number of positive and negative single-byte codes 75 * (counting 0==BOCU1_MIDDLE among the positive ones) 76 */ 77 #define BOCU1_SINGLE 64 78 79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */ 80 #define BOCU1_LEAD_2 43 81 #define BOCU1_LEAD_3 3 82 #define BOCU1_LEAD_4 1 83 84 /* The difference value range for single-byters. */ 85 #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) 86 #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) 87 88 /* The difference value range for double-byters. */ 89 #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 90 #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 91 92 /* The difference value range for 3-byters. */ 93 #define BOCU1_REACH_POS_3 \ 94 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 95 96 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 97 98 /* The lead byte start values. */ 99 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) 100 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) 101 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) 102 /* ==BOCU1_MAX_LEAD */ 103 104 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) 105 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) 106 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) 107 /* ==BOCU1_MIN+1 */ 108 109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ 110 #define BOCU1_LENGTH_FROM_LEAD(lead) \ 111 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ 112 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ 113 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) 114 115 /* The length of a byte sequence, according to its packed form. */ 116 #define BOCU1_LENGTH_FROM_PACKED(packed) \ 117 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) 118 119 /* 120 * 12 commonly used C0 control codes (and space) are only used to encode 121 * themselves directly, 122 * which makes BOCU-1 MIME-usable and reasonably safe for 123 * ASCII-oriented software. 124 * 125 * These controls are 126 * 0 NUL 127 * 128 * 7 BEL 129 * 8 BS 130 * 131 * 9 TAB 132 * a LF 133 * b VT 134 * c FF 135 * d CR 136 * 137 * e SO 138 * f SI 139 * 140 * 1a SUB 141 * 1b ESC 142 * 143 * The other 20 C0 controls are also encoded directly (to preserve order) 144 * but are also used as trail bytes in difference encoding 145 * (for better compression). 146 */ 147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 148 149 /* 150 * Byte value map for control codes, 151 * from external byte values 0x00..0x20 152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 153 * External byte values that are illegal as trail bytes are mapped to -1. 154 */ 155 static const int8_t 156 bocu1ByteToTrail[BOCU1_MIN]={ 157 /* 0 1 2 3 4 5 6 7 */ 158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 159 160 /* 8 9 a b c d e f */ 161 -1, -1, -1, -1, -1, -1, -1, -1, 162 163 /* 10 11 12 13 14 15 16 17 */ 164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 165 166 /* 18 19 1a 1b 1c 1d 1e 1f */ 167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 168 169 /* 20 */ 170 -1 171 }; 172 173 /* 174 * Byte value map for control codes, 175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 176 * to external byte values 0x00..0x20. 177 */ 178 static const int8_t 179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 180 /* 0 1 2 3 4 5 6 7 */ 181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 182 183 /* 8 9 a b c d e f */ 184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 185 186 /* 10 11 12 13 */ 187 0x1c, 0x1d, 0x1e, 0x1f 188 }; 189 190 /** 191 * Integer division and modulo with negative numerators 192 * yields negative modulo results and quotients that are one more than 193 * what we need here. 194 * This macro adjust the results so that the modulo-value m is always >=0. 195 * 196 * For positive n, the if() condition is always FALSE. 197 * 198 * @param n Number to be split into quotient and rest. 199 * Will be modified to contain the quotient. 200 * @param d Divisor. 201 * @param m Output variable for the rest (modulo result). 202 */ 203 #define NEGDIVMOD(n, d, m) { \ 204 (m)=(n)%(d); \ 205 (n)/=(d); \ 206 if((m)<0) { \ 207 --(n); \ 208 (m)+=(d); \ 209 } \ 210 } 211 212 /* Faster versions of packDiff() for single-byte-encoded diff values. */ 213 214 /** Is a diff value encodable in a single byte? */ 215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) 216 217 /** Encode a diff value in a single byte. */ 218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) 219 220 /** Is a diff value encodable in two bytes? */ 221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) 222 223 /* BOCU-1 implementation functions ------------------------------------------ */ 224 225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 226 227 /** 228 * Compute the next "previous" value for differencing 229 * from the current code point. 230 * 231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 232 * @return "previous code point" state value 233 */ 234 static inline int32_t 235 bocu1Prev(int32_t c) { 236 /* compute new prev */ 237 if(/* 0x3040<=c && */ c<=0x309f) { 238 /* Hiragana is not 128-aligned */ 239 return 0x3070; 240 } else if(0x4e00<=c && c<=0x9fa5) { 241 /* CJK Unihan */ 242 return 0x4e00-BOCU1_REACH_NEG_2; 243 } else if(0xac00<=c /* && c<=0xd7a3 */) { 244 /* Korean Hangul */ 245 return (0xd7a3+0xac00)/2; 246 } else { 247 /* mostly small scripts */ 248 return BOCU1_SIMPLE_PREV(c); 249 } 250 } 251 252 /** Fast version of bocu1Prev() for most scripts. */ 253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 254 255 /* 256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 257 * The UConverter fields are used as follows: 258 * 259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 260 * 261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 263 */ 264 265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */ 266 267 /** 268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 269 * and return a packed integer with them. 270 * 271 * The encoding favors small absolute differences with short encodings 272 * to compress runs of same-script characters. 273 * 274 * Optimized version with unrolled loops and fewer floating-point operations 275 * than the standard packDiff(). 276 * 277 * @param diff difference value -0x10ffff..0x10ffff 278 * @return 279 * 0x010000zz for 1-byte sequence zz 280 * 0x0200yyzz for 2-byte sequence yy zz 281 * 0x03xxyyzz for 3-byte sequence xx yy zz 282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 283 */ 284 static int32_t 285 packDiff(int32_t diff) { 286 int32_t result, m; 287 288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ 289 if(diff>=BOCU1_REACH_NEG_1) { 290 /* mostly positive differences, and single-byte negative ones */ 291 #if 0 /* single-byte case handled in macros, see below */ 292 if(diff<=BOCU1_REACH_POS_1) { 293 /* single byte */ 294 return 0x01000000|(BOCU1_MIDDLE+diff); 295 } else 296 #endif 297 if(diff<=BOCU1_REACH_POS_2) { 298 /* two bytes */ 299 diff-=BOCU1_REACH_POS_1+1; 300 result=0x02000000; 301 302 m=diff%BOCU1_TRAIL_COUNT; 303 diff/=BOCU1_TRAIL_COUNT; 304 result|=BOCU1_TRAIL_TO_BYTE(m); 305 306 result|=(BOCU1_START_POS_2+diff)<<8; 307 } else if(diff<=BOCU1_REACH_POS_3) { 308 /* three bytes */ 309 diff-=BOCU1_REACH_POS_2+1; 310 result=0x03000000; 311 312 m=diff%BOCU1_TRAIL_COUNT; 313 diff/=BOCU1_TRAIL_COUNT; 314 result|=BOCU1_TRAIL_TO_BYTE(m); 315 316 m=diff%BOCU1_TRAIL_COUNT; 317 diff/=BOCU1_TRAIL_COUNT; 318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 319 320 result|=(BOCU1_START_POS_3+diff)<<16; 321 } else { 322 /* four bytes */ 323 diff-=BOCU1_REACH_POS_3+1; 324 325 m=diff%BOCU1_TRAIL_COUNT; 326 diff/=BOCU1_TRAIL_COUNT; 327 result=BOCU1_TRAIL_TO_BYTE(m); 328 329 m=diff%BOCU1_TRAIL_COUNT; 330 diff/=BOCU1_TRAIL_COUNT; 331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 332 333 /* 334 * We know that / and % would deliver quotient 0 and rest=diff. 335 * Avoid division and modulo for performance. 336 */ 337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; 338 339 result|=((uint32_t)BOCU1_START_POS_4)<<24; 340 } 341 } else { 342 /* two- to four-byte negative differences */ 343 if(diff>=BOCU1_REACH_NEG_2) { 344 /* two bytes */ 345 diff-=BOCU1_REACH_NEG_1; 346 result=0x02000000; 347 348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 349 result|=BOCU1_TRAIL_TO_BYTE(m); 350 351 result|=(BOCU1_START_NEG_2+diff)<<8; 352 } else if(diff>=BOCU1_REACH_NEG_3) { 353 /* three bytes */ 354 diff-=BOCU1_REACH_NEG_2; 355 result=0x03000000; 356 357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 358 result|=BOCU1_TRAIL_TO_BYTE(m); 359 360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 362 363 result|=(BOCU1_START_NEG_3+diff)<<16; 364 } else { 365 /* four bytes */ 366 diff-=BOCU1_REACH_NEG_3; 367 368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 369 result=BOCU1_TRAIL_TO_BYTE(m); 370 371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 373 374 /* 375 * We know that NEGDIVMOD would deliver 376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. 377 * Avoid division and modulo for performance. 378 */ 379 m=diff+BOCU1_TRAIL_COUNT; 380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16; 381 382 result|=BOCU1_MIN<<24; 383 } 384 } 385 return result; 386 } 387 388 389 static void 390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 391 UErrorCode *pErrorCode) { 392 UConverter *cnv; 393 const UChar *source, *sourceLimit; 394 uint8_t *target; 395 int32_t targetCapacity; 396 int32_t *offsets; 397 398 int32_t prev, c, diff; 399 400 int32_t sourceIndex, nextSourceIndex; 401 402 U_ALIGN_CODE(16) 403 404 /* set up the local pointers */ 405 cnv=pArgs->converter; 406 source=pArgs->source; 407 sourceLimit=pArgs->sourceLimit; 408 target=(uint8_t *)pArgs->target; 409 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 410 offsets=pArgs->offsets; 411 412 /* get the converter state from UConverter */ 413 c=cnv->fromUChar32; 414 prev=(int32_t)cnv->fromUnicodeStatus; 415 if(prev==0) { 416 prev=BOCU1_ASCII_PREV; 417 } 418 419 /* sourceIndex=-1 if the current character began in the previous buffer */ 420 sourceIndex= c==0 ? 0 : -1; 421 nextSourceIndex=0; 422 423 /* conversion loop */ 424 if(c!=0 && targetCapacity>0) { 425 goto getTrail; 426 } 427 428 fastSingle: 429 /* fast loop for single-byte differences */ 430 /* use only one loop counter variable, targetCapacity, not also source */ 431 diff=(int32_t)(sourceLimit-source); 432 if(targetCapacity>diff) { 433 targetCapacity=diff; 434 } 435 while(targetCapacity>0 && (c=*source)<0x3000) { 436 if(c<=0x20) { 437 if(c!=0x20) { 438 prev=BOCU1_ASCII_PREV; 439 } 440 *target++=(uint8_t)c; 441 *offsets++=nextSourceIndex++; 442 ++source; 443 --targetCapacity; 444 } else { 445 diff=c-prev; 446 if(DIFF_IS_SINGLE(diff)) { 447 prev=BOCU1_SIMPLE_PREV(c); 448 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 449 *offsets++=nextSourceIndex++; 450 ++source; 451 --targetCapacity; 452 } else { 453 break; 454 } 455 } 456 } 457 /* restore real values */ 458 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 459 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 460 461 /* regular loop for all cases */ 462 while(source<sourceLimit) { 463 if(targetCapacity>0) { 464 c=*source++; 465 ++nextSourceIndex; 466 467 if(c<=0x20) { 468 /* 469 * ISO C0 control & space: 470 * Encode directly for MIME compatibility, 471 * and reset state except for space, to not disrupt compression. 472 */ 473 if(c!=0x20) { 474 prev=BOCU1_ASCII_PREV; 475 } 476 *target++=(uint8_t)c; 477 *offsets++=sourceIndex; 478 --targetCapacity; 479 480 sourceIndex=nextSourceIndex; 481 continue; 482 } 483 484 if(U16_IS_LEAD(c)) { 485 getTrail: 486 if(source<sourceLimit) { 487 /* test the following code unit */ 488 UChar trail=*source; 489 if(U16_IS_TRAIL(trail)) { 490 ++source; 491 ++nextSourceIndex; 492 c=U16_GET_SUPPLEMENTARY(c, trail); 493 } 494 } else { 495 /* no more input */ 496 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 497 break; 498 } 499 } 500 501 /* 502 * all other Unicode code points c==U+0021..U+10ffff 503 * are encoded with the difference c-prev 504 * 505 * a new prev is computed from c, 506 * placed in the middle of a 0x80-block (for most small scripts) or 507 * in the middle of the Unihan and Hangul blocks 508 * to statistically minimize the following difference 509 */ 510 diff=c-prev; 511 prev=BOCU1_PREV(c); 512 if(DIFF_IS_SINGLE(diff)) { 513 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 514 *offsets++=sourceIndex; 515 --targetCapacity; 516 sourceIndex=nextSourceIndex; 517 if(c<0x3000) { 518 goto fastSingle; 519 } 520 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 521 /* optimize 2-byte case */ 522 int32_t m; 523 524 if(diff>=0) { 525 diff-=BOCU1_REACH_POS_1+1; 526 m=diff%BOCU1_TRAIL_COUNT; 527 diff/=BOCU1_TRAIL_COUNT; 528 diff+=BOCU1_START_POS_2; 529 } else { 530 diff-=BOCU1_REACH_NEG_1; 531 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 532 diff+=BOCU1_START_NEG_2; 533 } 534 *target++=(uint8_t)diff; 535 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 536 *offsets++=sourceIndex; 537 *offsets++=sourceIndex; 538 targetCapacity-=2; 539 sourceIndex=nextSourceIndex; 540 } else { 541 int32_t length; /* will be 2..4 */ 542 543 diff=packDiff(diff); 544 length=BOCU1_LENGTH_FROM_PACKED(diff); 545 546 /* write the output character bytes from diff and length */ 547 /* from the first if in the loop we know that targetCapacity>0 */ 548 if(length<=targetCapacity) { 549 switch(length) { 550 /* each branch falls through to the next one */ 551 case 4: 552 *target++=(uint8_t)(diff>>24); 553 *offsets++=sourceIndex; 554 case 3: /*fall through*/ 555 *target++=(uint8_t)(diff>>16); 556 *offsets++=sourceIndex; 557 case 2: /*fall through*/ 558 *target++=(uint8_t)(diff>>8); 559 *offsets++=sourceIndex; 560 /* case 1: handled above */ 561 *target++=(uint8_t)diff; 562 *offsets++=sourceIndex; 563 default: 564 /* will never occur */ 565 break; 566 } 567 targetCapacity-=length; 568 sourceIndex=nextSourceIndex; 569 } else { 570 uint8_t *charErrorBuffer; 571 572 /* 573 * We actually do this backwards here: 574 * In order to save an intermediate variable, we output 575 * first to the overflow buffer what does not fit into the 576 * regular target. 577 */ 578 /* we know that 1<=targetCapacity<length<=4 */ 579 length-=targetCapacity; 580 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 581 switch(length) { 582 /* each branch falls through to the next one */ 583 case 3: 584 *charErrorBuffer++=(uint8_t)(diff>>16); 585 case 2: /*fall through*/ 586 *charErrorBuffer++=(uint8_t)(diff>>8); 587 case 1: /*fall through*/ 588 *charErrorBuffer=(uint8_t)diff; 589 default: 590 /* will never occur */ 591 break; 592 } 593 cnv->charErrorBufferLength=(int8_t)length; 594 595 /* now output what fits into the regular target */ 596 diff>>=8*length; /* length was reduced by targetCapacity */ 597 switch(targetCapacity) { 598 /* each branch falls through to the next one */ 599 case 3: 600 *target++=(uint8_t)(diff>>16); 601 *offsets++=sourceIndex; 602 case 2: /*fall through*/ 603 *target++=(uint8_t)(diff>>8); 604 *offsets++=sourceIndex; 605 case 1: /*fall through*/ 606 *target++=(uint8_t)diff; 607 *offsets++=sourceIndex; 608 default: 609 /* will never occur */ 610 break; 611 } 612 613 /* target overflow */ 614 targetCapacity=0; 615 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 616 break; 617 } 618 } 619 } else { 620 /* target is full */ 621 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 622 break; 623 } 624 } 625 626 /* set the converter state back into UConverter */ 627 cnv->fromUChar32= c<0 ? -c : 0; 628 cnv->fromUnicodeStatus=(uint32_t)prev; 629 630 /* write back the updated pointers */ 631 pArgs->source=source; 632 pArgs->target=(char *)target; 633 pArgs->offsets=offsets; 634 } 635 636 /* 637 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. 638 * If a change is made in the original function, then either 639 * change this function the same way or 640 * re-copy the original function and remove the variables 641 * offsets, sourceIndex, and nextSourceIndex. 642 */ 643 static void 644 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, 645 UErrorCode *pErrorCode) { 646 UConverter *cnv; 647 const UChar *source, *sourceLimit; 648 uint8_t *target; 649 int32_t targetCapacity; 650 651 int32_t prev, c, diff; 652 653 /* set up the local pointers */ 654 cnv=pArgs->converter; 655 source=pArgs->source; 656 sourceLimit=pArgs->sourceLimit; 657 target=(uint8_t *)pArgs->target; 658 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 659 660 /* get the converter state from UConverter */ 661 c=cnv->fromUChar32; 662 prev=(int32_t)cnv->fromUnicodeStatus; 663 if(prev==0) { 664 prev=BOCU1_ASCII_PREV; 665 } 666 667 /* conversion loop */ 668 if(c!=0 && targetCapacity>0) { 669 goto getTrail; 670 } 671 672 fastSingle: 673 /* fast loop for single-byte differences */ 674 /* use only one loop counter variable, targetCapacity, not also source */ 675 diff=(int32_t)(sourceLimit-source); 676 if(targetCapacity>diff) { 677 targetCapacity=diff; 678 } 679 while(targetCapacity>0 && (c=*source)<0x3000) { 680 if(c<=0x20) { 681 if(c!=0x20) { 682 prev=BOCU1_ASCII_PREV; 683 } 684 *target++=(uint8_t)c; 685 } else { 686 diff=c-prev; 687 if(DIFF_IS_SINGLE(diff)) { 688 prev=BOCU1_SIMPLE_PREV(c); 689 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 690 } else { 691 break; 692 } 693 } 694 ++source; 695 --targetCapacity; 696 } 697 /* restore real values */ 698 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 699 700 /* regular loop for all cases */ 701 while(source<sourceLimit) { 702 if(targetCapacity>0) { 703 c=*source++; 704 705 if(c<=0x20) { 706 /* 707 * ISO C0 control & space: 708 * Encode directly for MIME compatibility, 709 * and reset state except for space, to not disrupt compression. 710 */ 711 if(c!=0x20) { 712 prev=BOCU1_ASCII_PREV; 713 } 714 *target++=(uint8_t)c; 715 --targetCapacity; 716 continue; 717 } 718 719 if(U16_IS_LEAD(c)) { 720 getTrail: 721 if(source<sourceLimit) { 722 /* test the following code unit */ 723 UChar trail=*source; 724 if(U16_IS_TRAIL(trail)) { 725 ++source; 726 c=U16_GET_SUPPLEMENTARY(c, trail); 727 } 728 } else { 729 /* no more input */ 730 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 731 break; 732 } 733 } 734 735 /* 736 * all other Unicode code points c==U+0021..U+10ffff 737 * are encoded with the difference c-prev 738 * 739 * a new prev is computed from c, 740 * placed in the middle of a 0x80-block (for most small scripts) or 741 * in the middle of the Unihan and Hangul blocks 742 * to statistically minimize the following difference 743 */ 744 diff=c-prev; 745 prev=BOCU1_PREV(c); 746 if(DIFF_IS_SINGLE(diff)) { 747 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 748 --targetCapacity; 749 if(c<0x3000) { 750 goto fastSingle; 751 } 752 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 753 /* optimize 2-byte case */ 754 int32_t m; 755 756 if(diff>=0) { 757 diff-=BOCU1_REACH_POS_1+1; 758 m=diff%BOCU1_TRAIL_COUNT; 759 diff/=BOCU1_TRAIL_COUNT; 760 diff+=BOCU1_START_POS_2; 761 } else { 762 diff-=BOCU1_REACH_NEG_1; 763 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 764 diff+=BOCU1_START_NEG_2; 765 } 766 *target++=(uint8_t)diff; 767 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 768 targetCapacity-=2; 769 } else { 770 int32_t length; /* will be 2..4 */ 771 772 diff=packDiff(diff); 773 length=BOCU1_LENGTH_FROM_PACKED(diff); 774 775 /* write the output character bytes from diff and length */ 776 /* from the first if in the loop we know that targetCapacity>0 */ 777 if(length<=targetCapacity) { 778 switch(length) { 779 /* each branch falls through to the next one */ 780 case 4: 781 *target++=(uint8_t)(diff>>24); 782 case 3: /*fall through*/ 783 *target++=(uint8_t)(diff>>16); 784 /* case 2: handled above */ 785 *target++=(uint8_t)(diff>>8); 786 /* case 1: handled above */ 787 *target++=(uint8_t)diff; 788 default: 789 /* will never occur */ 790 break; 791 } 792 targetCapacity-=length; 793 } else { 794 uint8_t *charErrorBuffer; 795 796 /* 797 * We actually do this backwards here: 798 * In order to save an intermediate variable, we output 799 * first to the overflow buffer what does not fit into the 800 * regular target. 801 */ 802 /* we know that 1<=targetCapacity<length<=4 */ 803 length-=targetCapacity; 804 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 805 switch(length) { 806 /* each branch falls through to the next one */ 807 case 3: 808 *charErrorBuffer++=(uint8_t)(diff>>16); 809 case 2: /*fall through*/ 810 *charErrorBuffer++=(uint8_t)(diff>>8); 811 case 1: /*fall through*/ 812 *charErrorBuffer=(uint8_t)diff; 813 default: 814 /* will never occur */ 815 break; 816 } 817 cnv->charErrorBufferLength=(int8_t)length; 818 819 /* now output what fits into the regular target */ 820 diff>>=8*length; /* length was reduced by targetCapacity */ 821 switch(targetCapacity) { 822 /* each branch falls through to the next one */ 823 case 3: 824 *target++=(uint8_t)(diff>>16); 825 case 2: /*fall through*/ 826 *target++=(uint8_t)(diff>>8); 827 case 1: /*fall through*/ 828 *target++=(uint8_t)diff; 829 default: 830 /* will never occur */ 831 break; 832 } 833 834 /* target overflow */ 835 targetCapacity=0; 836 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 837 break; 838 } 839 } 840 } else { 841 /* target is full */ 842 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 843 break; 844 } 845 } 846 847 /* set the converter state back into UConverter */ 848 cnv->fromUChar32= c<0 ? -c : 0; 849 cnv->fromUnicodeStatus=(uint32_t)prev; 850 851 /* write back the updated pointers */ 852 pArgs->source=source; 853 pArgs->target=(char *)target; 854 } 855 856 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */ 857 858 /** 859 * Function for BOCU-1 decoder; handles multi-byte lead bytes. 860 * 861 * @param b lead byte; 862 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 863 * @return (diff<<2)|count 864 */ 865 static inline int32_t 866 decodeBocu1LeadByte(int32_t b) { 867 int32_t diff, count; 868 869 if(b>=BOCU1_START_NEG_2) { 870 /* positive difference */ 871 if(b<BOCU1_START_POS_3) { 872 /* two bytes */ 873 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 874 count=1; 875 } else if(b<BOCU1_START_POS_4) { 876 /* three bytes */ 877 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 878 count=2; 879 } else { 880 /* four bytes */ 881 diff=BOCU1_REACH_POS_3+1; 882 count=3; 883 } 884 } else { 885 /* negative difference */ 886 if(b>=BOCU1_START_NEG_3) { 887 /* two bytes */ 888 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 889 count=1; 890 } else if(b>BOCU1_MIN) { 891 /* three bytes */ 892 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 893 count=2; 894 } else { 895 /* four bytes */ 896 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 897 count=3; 898 } 899 } 900 901 /* return the state for decoding the trail byte(s) */ 902 return (diff<<2)|count; 903 } 904 905 /** 906 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 907 * 908 * @param count number of remaining trail bytes including this one 909 * @param b trail byte 910 * @return new delta for diff including b - <0 indicates an error 911 * 912 * @see decodeBocu1 913 */ 914 static inline int32_t 915 decodeBocu1TrailByte(int32_t count, int32_t b) { 916 if(b<=0x20) { 917 /* skip some C0 controls and make the trail byte range contiguous */ 918 b=bocu1ByteToTrail[b]; 919 /* b<0 for an illegal trail byte value will result in return<0 below */ 920 #if BOCU1_MAX_TRAIL<0xff 921 } else if(b>BOCU1_MAX_TRAIL) { 922 return -99; 923 #endif 924 } else { 925 b-=BOCU1_TRAIL_BYTE_OFFSET; 926 } 927 928 /* add trail byte into difference and decrement count */ 929 if(count==1) { 930 return b; 931 } else if(count==2) { 932 return b*BOCU1_TRAIL_COUNT; 933 } else /* count==3 */ { 934 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); 935 } 936 } 937 938 static void 939 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 940 UErrorCode *pErrorCode) { 941 UConverter *cnv; 942 const uint8_t *source, *sourceLimit; 943 UChar *target; 944 const UChar *targetLimit; 945 int32_t *offsets; 946 947 int32_t prev, count, diff, c; 948 949 int8_t byteIndex; 950 uint8_t *bytes; 951 952 int32_t sourceIndex, nextSourceIndex; 953 954 /* set up the local pointers */ 955 cnv=pArgs->converter; 956 source=(const uint8_t *)pArgs->source; 957 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 958 target=pArgs->target; 959 targetLimit=pArgs->targetLimit; 960 offsets=pArgs->offsets; 961 962 /* get the converter state from UConverter */ 963 prev=(int32_t)cnv->toUnicodeStatus; 964 if(prev==0) { 965 prev=BOCU1_ASCII_PREV; 966 } 967 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 968 count=diff&3; 969 diff>>=2; 970 971 byteIndex=cnv->toULength; 972 bytes=cnv->toUBytes; 973 974 /* sourceIndex=-1 if the current character began in the previous buffer */ 975 sourceIndex=byteIndex==0 ? 0 : -1; 976 nextSourceIndex=0; 977 978 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 979 if(count>0 && byteIndex>0 && target<targetLimit) { 980 goto getTrail; 981 } 982 983 fastSingle: 984 /* fast loop for single-byte differences */ 985 /* use count as the only loop counter variable */ 986 diff=(int32_t)(sourceLimit-source); 987 count=(int32_t)(pArgs->targetLimit-target); 988 if(count>diff) { 989 count=diff; 990 } 991 while(count>0) { 992 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 993 c=prev+(c-BOCU1_MIDDLE); 994 if(c<0x3000) { 995 *target++=(UChar)c; 996 *offsets++=nextSourceIndex++; 997 prev=BOCU1_SIMPLE_PREV(c); 998 } else { 999 break; 1000 } 1001 } else if(c<=0x20) { 1002 if(c!=0x20) { 1003 prev=BOCU1_ASCII_PREV; 1004 } 1005 *target++=(UChar)c; 1006 *offsets++=nextSourceIndex++; 1007 } else { 1008 break; 1009 } 1010 ++source; 1011 --count; 1012 } 1013 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1014 1015 /* decode a sequence of single and lead bytes */ 1016 while(source<sourceLimit) { 1017 if(target>=targetLimit) { 1018 /* target is full */ 1019 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1020 break; 1021 } 1022 1023 ++nextSourceIndex; 1024 c=*source++; 1025 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1026 /* Write a code point directly from a single-byte difference. */ 1027 c=prev+(c-BOCU1_MIDDLE); 1028 if(c<0x3000) { 1029 *target++=(UChar)c; 1030 *offsets++=sourceIndex; 1031 prev=BOCU1_SIMPLE_PREV(c); 1032 sourceIndex=nextSourceIndex; 1033 goto fastSingle; 1034 } 1035 } else if(c<=0x20) { 1036 /* 1037 * Direct-encoded C0 control code or space. 1038 * Reset prev for C0 control codes but not for space. 1039 */ 1040 if(c!=0x20) { 1041 prev=BOCU1_ASCII_PREV; 1042 } 1043 *target++=(UChar)c; 1044 *offsets++=sourceIndex; 1045 sourceIndex=nextSourceIndex; 1046 continue; 1047 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1048 /* Optimize two-byte case. */ 1049 if(c>=BOCU1_MIDDLE) { 1050 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1051 } else { 1052 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1053 } 1054 1055 /* trail byte */ 1056 ++nextSourceIndex; 1057 c=decodeBocu1TrailByte(1, *source++); 1058 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1059 bytes[0]=source[-2]; 1060 bytes[1]=source[-1]; 1061 byteIndex=2; 1062 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1063 break; 1064 } 1065 } else if(c==BOCU1_RESET) { 1066 /* only reset the state, no code point */ 1067 prev=BOCU1_ASCII_PREV; 1068 sourceIndex=nextSourceIndex; 1069 continue; 1070 } else { 1071 /* 1072 * For multi-byte difference lead bytes, set the decoder state 1073 * with the partial difference value from the lead byte and 1074 * with the number of trail bytes. 1075 */ 1076 bytes[0]=(uint8_t)c; 1077 byteIndex=1; 1078 1079 diff=decodeBocu1LeadByte(c); 1080 count=diff&3; 1081 diff>>=2; 1082 getTrail: 1083 for(;;) { 1084 if(source>=sourceLimit) { 1085 goto endloop; 1086 } 1087 ++nextSourceIndex; 1088 c=bytes[byteIndex++]=*source++; 1089 1090 /* trail byte in any position */ 1091 c=decodeBocu1TrailByte(count, c); 1092 if(c<0) { 1093 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1094 goto endloop; 1095 } 1096 1097 diff+=c; 1098 if(--count==0) { 1099 /* final trail byte, deliver a code point */ 1100 byteIndex=0; 1101 c=prev+diff; 1102 if((uint32_t)c>0x10ffff) { 1103 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1104 goto endloop; 1105 } 1106 break; 1107 } 1108 } 1109 } 1110 1111 /* calculate the next prev and output c */ 1112 prev=BOCU1_PREV(c); 1113 if(c<=0xffff) { 1114 *target++=(UChar)c; 1115 *offsets++=sourceIndex; 1116 } else { 1117 /* output surrogate pair */ 1118 *target++=U16_LEAD(c); 1119 if(target<targetLimit) { 1120 *target++=U16_TRAIL(c); 1121 *offsets++=sourceIndex; 1122 *offsets++=sourceIndex; 1123 } else { 1124 /* target overflow */ 1125 *offsets++=sourceIndex; 1126 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1127 cnv->UCharErrorBufferLength=1; 1128 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1129 break; 1130 } 1131 } 1132 sourceIndex=nextSourceIndex; 1133 } 1134 endloop: 1135 1136 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1137 /* set the converter state in UConverter to deal with the next character */ 1138 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1139 cnv->mode=0; 1140 } else { 1141 /* set the converter state back into UConverter */ 1142 cnv->toUnicodeStatus=(uint32_t)prev; 1143 cnv->mode=(diff<<2)|count; 1144 } 1145 cnv->toULength=byteIndex; 1146 1147 /* write back the updated pointers */ 1148 pArgs->source=(const char *)source; 1149 pArgs->target=target; 1150 pArgs->offsets=offsets; 1151 return; 1152 } 1153 1154 /* 1155 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. 1156 * If a change is made in the original function, then either 1157 * change this function the same way or 1158 * re-copy the original function and remove the variables 1159 * offsets, sourceIndex, and nextSourceIndex. 1160 */ 1161 static void 1162 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1163 UErrorCode *pErrorCode) { 1164 UConverter *cnv; 1165 const uint8_t *source, *sourceLimit; 1166 UChar *target; 1167 const UChar *targetLimit; 1168 1169 int32_t prev, count, diff, c; 1170 1171 int8_t byteIndex; 1172 uint8_t *bytes; 1173 1174 U_ALIGN_CODE(16) 1175 1176 /* set up the local pointers */ 1177 cnv=pArgs->converter; 1178 source=(const uint8_t *)pArgs->source; 1179 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1180 target=pArgs->target; 1181 targetLimit=pArgs->targetLimit; 1182 1183 /* get the converter state from UConverter */ 1184 prev=(int32_t)cnv->toUnicodeStatus; 1185 if(prev==0) { 1186 prev=BOCU1_ASCII_PREV; 1187 } 1188 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1189 count=diff&3; 1190 diff>>=2; 1191 1192 byteIndex=cnv->toULength; 1193 bytes=cnv->toUBytes; 1194 1195 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1196 if(count>0 && byteIndex>0 && target<targetLimit) { 1197 goto getTrail; 1198 } 1199 1200 fastSingle: 1201 /* fast loop for single-byte differences */ 1202 /* use count as the only loop counter variable */ 1203 diff=(int32_t)(sourceLimit-source); 1204 count=(int32_t)(pArgs->targetLimit-target); 1205 if(count>diff) { 1206 count=diff; 1207 } 1208 while(count>0) { 1209 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1210 c=prev+(c-BOCU1_MIDDLE); 1211 if(c<0x3000) { 1212 *target++=(UChar)c; 1213 prev=BOCU1_SIMPLE_PREV(c); 1214 } else { 1215 break; 1216 } 1217 } else if(c<=0x20) { 1218 if(c!=0x20) { 1219 prev=BOCU1_ASCII_PREV; 1220 } 1221 *target++=(UChar)c; 1222 } else { 1223 break; 1224 } 1225 ++source; 1226 --count; 1227 } 1228 1229 /* decode a sequence of single and lead bytes */ 1230 while(source<sourceLimit) { 1231 if(target>=targetLimit) { 1232 /* target is full */ 1233 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1234 break; 1235 } 1236 1237 c=*source++; 1238 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1239 /* Write a code point directly from a single-byte difference. */ 1240 c=prev+(c-BOCU1_MIDDLE); 1241 if(c<0x3000) { 1242 *target++=(UChar)c; 1243 prev=BOCU1_SIMPLE_PREV(c); 1244 goto fastSingle; 1245 } 1246 } else if(c<=0x20) { 1247 /* 1248 * Direct-encoded C0 control code or space. 1249 * Reset prev for C0 control codes but not for space. 1250 */ 1251 if(c!=0x20) { 1252 prev=BOCU1_ASCII_PREV; 1253 } 1254 *target++=(UChar)c; 1255 continue; 1256 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1257 /* Optimize two-byte case. */ 1258 if(c>=BOCU1_MIDDLE) { 1259 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1260 } else { 1261 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1262 } 1263 1264 /* trail byte */ 1265 c=decodeBocu1TrailByte(1, *source++); 1266 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1267 bytes[0]=source[-2]; 1268 bytes[1]=source[-1]; 1269 byteIndex=2; 1270 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1271 break; 1272 } 1273 } else if(c==BOCU1_RESET) { 1274 /* only reset the state, no code point */ 1275 prev=BOCU1_ASCII_PREV; 1276 continue; 1277 } else { 1278 /* 1279 * For multi-byte difference lead bytes, set the decoder state 1280 * with the partial difference value from the lead byte and 1281 * with the number of trail bytes. 1282 */ 1283 bytes[0]=(uint8_t)c; 1284 byteIndex=1; 1285 1286 diff=decodeBocu1LeadByte(c); 1287 count=diff&3; 1288 diff>>=2; 1289 getTrail: 1290 for(;;) { 1291 if(source>=sourceLimit) { 1292 goto endloop; 1293 } 1294 c=bytes[byteIndex++]=*source++; 1295 1296 /* trail byte in any position */ 1297 c=decodeBocu1TrailByte(count, c); 1298 if(c<0) { 1299 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1300 goto endloop; 1301 } 1302 1303 diff+=c; 1304 if(--count==0) { 1305 /* final trail byte, deliver a code point */ 1306 byteIndex=0; 1307 c=prev+diff; 1308 if((uint32_t)c>0x10ffff) { 1309 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1310 goto endloop; 1311 } 1312 break; 1313 } 1314 } 1315 } 1316 1317 /* calculate the next prev and output c */ 1318 prev=BOCU1_PREV(c); 1319 if(c<=0xffff) { 1320 *target++=(UChar)c; 1321 } else { 1322 /* output surrogate pair */ 1323 *target++=U16_LEAD(c); 1324 if(target<targetLimit) { 1325 *target++=U16_TRAIL(c); 1326 } else { 1327 /* target overflow */ 1328 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1329 cnv->UCharErrorBufferLength=1; 1330 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1331 break; 1332 } 1333 } 1334 } 1335 endloop: 1336 1337 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1338 /* set the converter state in UConverter to deal with the next character */ 1339 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1340 cnv->mode=0; 1341 } else { 1342 /* set the converter state back into UConverter */ 1343 cnv->toUnicodeStatus=(uint32_t)prev; 1344 cnv->mode=(diff<<2)|count; 1345 } 1346 cnv->toULength=byteIndex; 1347 1348 /* write back the updated pointers */ 1349 pArgs->source=(const char *)source; 1350 pArgs->target=target; 1351 return; 1352 } 1353 1354 /* miscellaneous ------------------------------------------------------------ */ 1355 1356 static const UConverterImpl _Bocu1Impl={ 1357 UCNV_BOCU1, 1358 1359 NULL, 1360 NULL, 1361 1362 NULL, 1363 NULL, 1364 NULL, 1365 1366 _Bocu1ToUnicode, 1367 _Bocu1ToUnicodeWithOffsets, 1368 _Bocu1FromUnicode, 1369 _Bocu1FromUnicodeWithOffsets, 1370 NULL, 1371 1372 NULL, 1373 NULL, 1374 NULL, 1375 NULL, 1376 ucnv_getCompleteUnicodeSet, 1377 1378 NULL, 1379 NULL 1380 }; 1381 1382 static const UConverterStaticData _Bocu1StaticData={ 1383 sizeof(UConverterStaticData), 1384 "BOCU-1", 1385 1214, /* CCSID for BOCU-1 */ 1386 UCNV_IBM, UCNV_BOCU1, 1387 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ 1388 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1389 FALSE, FALSE, 1390 0, 1391 0, 1392 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1393 }; 1394 1395 const UConverterSharedData _Bocu1Data={ 1396 sizeof(UConverterSharedData), ~((uint32_t)0), 1397 NULL, NULL, &_Bocu1StaticData, FALSE, &_Bocu1Impl, 1398 0, 1399 UCNV_MBCS_TABLE_INITIALIZER 1400 }; 1401 1402 #endif 1403