1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2002-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * file name: ucnvbocu.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002mar27 14 * created by: Markus W. Scherer 15 * 16 * This is an implementation of the Binary Ordered Compression for Unicode, 17 * in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 23 24 #include "unicode/ucnv.h" 25 #include "unicode/ucnv_cb.h" 26 #include "unicode/utf16.h" 27 #include "putilimp.h" 28 #include "ucnv_bld.h" 29 #include "ucnv_cnv.h" 30 #include "uassert.h" 31 32 /* BOCU-1 constants and macros ---------------------------------------------- */ 33 34 /* 35 * BOCU-1 encodes the code points of a Unicode string as 36 * a sequence of byte-encoded differences (slope detection), 37 * preserving lexical order. 38 * 39 * Optimize the difference-taking for runs of Unicode text within 40 * small scripts: 41 * 42 * Most small scripts are allocated within aligned 128-blocks of Unicode 43 * code points. Lexical order is preserved if the "previous code point" state 44 * is always moved into the middle of such a block. 45 * 46 * Additionally, "prev" is moved from anywhere in the Unihan and Hangul 47 * areas into the middle of those areas. 48 * 49 * C0 control codes and space are encoded with their US-ASCII bytes. 50 * "prev" is reset for C0 controls but not for space. 51 */ 52 53 /* initial value for "prev": middle of the ASCII range */ 54 #define BOCU1_ASCII_PREV 0x40 55 56 /* bounding byte values for differences */ 57 #define BOCU1_MIN 0x21 58 #define BOCU1_MIDDLE 0x90 59 #define BOCU1_MAX_LEAD 0xfe 60 #define BOCU1_MAX_TRAIL 0xff 61 #define BOCU1_RESET 0xff 62 63 /* number of lead bytes */ 64 #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) 65 66 /* adjust trail byte counts for the use of some C0 control byte values */ 67 #define BOCU1_TRAIL_CONTROLS_COUNT 20 68 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) 69 70 /* number of trail bytes */ 71 #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) 72 73 /* 74 * number of positive and negative single-byte codes 75 * (counting 0==BOCU1_MIDDLE among the positive ones) 76 */ 77 #define BOCU1_SINGLE 64 78 79 /* number of lead bytes for positive and negative 2/3/4-byte sequences */ 80 #define BOCU1_LEAD_2 43 81 #define BOCU1_LEAD_3 3 82 #define BOCU1_LEAD_4 1 83 84 /* The difference value range for single-byters. */ 85 #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) 86 #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) 87 88 /* The difference value range for double-byters. */ 89 #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 90 #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) 91 92 /* The difference value range for 3-byters. */ 93 #define BOCU1_REACH_POS_3 \ 94 (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 95 96 #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) 97 98 /* The lead byte start values. */ 99 #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) 100 #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) 101 #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) 102 /* ==BOCU1_MAX_LEAD */ 103 104 #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) 105 #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) 106 #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) 107 /* ==BOCU1_MIN+1 */ 108 109 /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ 110 #define BOCU1_LENGTH_FROM_LEAD(lead) \ 111 ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ 112 (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ 113 (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) 114 115 /* The length of a byte sequence, according to its packed form. */ 116 #define BOCU1_LENGTH_FROM_PACKED(packed) \ 117 ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) 118 119 /* 120 * 12 commonly used C0 control codes (and space) are only used to encode 121 * themselves directly, 122 * which makes BOCU-1 MIME-usable and reasonably safe for 123 * ASCII-oriented software. 124 * 125 * These controls are 126 * 0 NUL 127 * 128 * 7 BEL 129 * 8 BS 130 * 131 * 9 TAB 132 * a LF 133 * b VT 134 * c FF 135 * d CR 136 * 137 * e SO 138 * f SI 139 * 140 * 1a SUB 141 * 1b ESC 142 * 143 * The other 20 C0 controls are also encoded directly (to preserve order) 144 * but are also used as trail bytes in difference encoding 145 * (for better compression). 146 */ 147 #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) 148 149 /* 150 * Byte value map for control codes, 151 * from external byte values 0x00..0x20 152 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. 153 * External byte values that are illegal as trail bytes are mapped to -1. 154 */ 155 static const int8_t 156 bocu1ByteToTrail[BOCU1_MIN]={ 157 /* 0 1 2 3 4 5 6 7 */ 158 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, 159 160 /* 8 9 a b c d e f */ 161 -1, -1, -1, -1, -1, -1, -1, -1, 162 163 /* 10 11 12 13 14 15 16 17 */ 164 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 165 166 /* 18 19 1a 1b 1c 1d 1e 1f */ 167 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, 168 169 /* 20 */ 170 -1 171 }; 172 173 /* 174 * Byte value map for control codes, 175 * from trail byte values 0..19 (0..0x13) as used in the difference calculation 176 * to external byte values 0x00..0x20. 177 */ 178 static const int8_t 179 bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ 180 /* 0 1 2 3 4 5 6 7 */ 181 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 182 183 /* 8 9 a b c d e f */ 184 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 185 186 /* 10 11 12 13 */ 187 0x1c, 0x1d, 0x1e, 0x1f 188 }; 189 190 /** 191 * Integer division and modulo with negative numerators 192 * yields negative modulo results and quotients that are one more than 193 * what we need here. 194 * This macro adjust the results so that the modulo-value m is always >=0. 195 * 196 * For positive n, the if() condition is always FALSE. 197 * 198 * @param n Number to be split into quotient and rest. 199 * Will be modified to contain the quotient. 200 * @param d Divisor. 201 * @param m Output variable for the rest (modulo result). 202 */ 203 #define NEGDIVMOD(n, d, m) { \ 204 (m)=(n)%(d); \ 205 (n)/=(d); \ 206 if((m)<0) { \ 207 --(n); \ 208 (m)+=(d); \ 209 } \ 210 } 211 212 /* Faster versions of packDiff() for single-byte-encoded diff values. */ 213 214 /** Is a diff value encodable in a single byte? */ 215 #define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) 216 217 /** Encode a diff value in a single byte. */ 218 #define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) 219 220 /** Is a diff value encodable in two bytes? */ 221 #define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) 222 223 /* BOCU-1 implementation functions ------------------------------------------ */ 224 225 #define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) 226 227 /** 228 * Compute the next "previous" value for differencing 229 * from the current code point. 230 * 231 * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) 232 * @return "previous code point" state value 233 */ 234 static inline int32_t 235 bocu1Prev(int32_t c) { 236 /* compute new prev */ 237 if(/* 0x3040<=c && */ c<=0x309f) { 238 /* Hiragana is not 128-aligned */ 239 return 0x3070; 240 } else if(0x4e00<=c && c<=0x9fa5) { 241 /* CJK Unihan */ 242 return 0x4e00-BOCU1_REACH_NEG_2; 243 } else if(0xac00<=c /* && c<=0xd7a3 */) { 244 /* Korean Hangul */ 245 return (0xd7a3+0xac00)/2; 246 } else { 247 /* mostly small scripts */ 248 return BOCU1_SIMPLE_PREV(c); 249 } 250 } 251 252 /** Fast version of bocu1Prev() for most scripts. */ 253 #define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) 254 255 /* 256 * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. 257 * The UConverter fields are used as follows: 258 * 259 * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 260 * 261 * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) 262 * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) 263 */ 264 265 /* BOCU-1-from-Unicode conversion functions --------------------------------- */ 266 267 /** 268 * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes 269 * and return a packed integer with them. 270 * 271 * The encoding favors small absolute differences with short encodings 272 * to compress runs of same-script characters. 273 * 274 * Optimized version with unrolled loops and fewer floating-point operations 275 * than the standard packDiff(). 276 * 277 * @param diff difference value -0x10ffff..0x10ffff 278 * @return 279 * 0x010000zz for 1-byte sequence zz 280 * 0x0200yyzz for 2-byte sequence yy zz 281 * 0x03xxyyzz for 3-byte sequence xx yy zz 282 * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) 283 */ 284 static int32_t 285 packDiff(int32_t diff) { 286 int32_t result, m; 287 288 U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ 289 if(diff>=BOCU1_REACH_NEG_1) { 290 /* mostly positive differences, and single-byte negative ones */ 291 #if 0 /* single-byte case handled in macros, see below */ 292 if(diff<=BOCU1_REACH_POS_1) { 293 /* single byte */ 294 return 0x01000000|(BOCU1_MIDDLE+diff); 295 } else 296 #endif 297 if(diff<=BOCU1_REACH_POS_2) { 298 /* two bytes */ 299 diff-=BOCU1_REACH_POS_1+1; 300 result=0x02000000; 301 302 m=diff%BOCU1_TRAIL_COUNT; 303 diff/=BOCU1_TRAIL_COUNT; 304 result|=BOCU1_TRAIL_TO_BYTE(m); 305 306 result|=(BOCU1_START_POS_2+diff)<<8; 307 } else if(diff<=BOCU1_REACH_POS_3) { 308 /* three bytes */ 309 diff-=BOCU1_REACH_POS_2+1; 310 result=0x03000000; 311 312 m=diff%BOCU1_TRAIL_COUNT; 313 diff/=BOCU1_TRAIL_COUNT; 314 result|=BOCU1_TRAIL_TO_BYTE(m); 315 316 m=diff%BOCU1_TRAIL_COUNT; 317 diff/=BOCU1_TRAIL_COUNT; 318 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 319 320 result|=(BOCU1_START_POS_3+diff)<<16; 321 } else { 322 /* four bytes */ 323 diff-=BOCU1_REACH_POS_3+1; 324 325 m=diff%BOCU1_TRAIL_COUNT; 326 diff/=BOCU1_TRAIL_COUNT; 327 result=BOCU1_TRAIL_TO_BYTE(m); 328 329 m=diff%BOCU1_TRAIL_COUNT; 330 diff/=BOCU1_TRAIL_COUNT; 331 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 332 333 /* 334 * We know that / and % would deliver quotient 0 and rest=diff. 335 * Avoid division and modulo for performance. 336 */ 337 result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; 338 339 result|=((uint32_t)BOCU1_START_POS_4)<<24; 340 } 341 } else { 342 /* two- to four-byte negative differences */ 343 if(diff>=BOCU1_REACH_NEG_2) { 344 /* two bytes */ 345 diff-=BOCU1_REACH_NEG_1; 346 result=0x02000000; 347 348 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 349 result|=BOCU1_TRAIL_TO_BYTE(m); 350 351 result|=(BOCU1_START_NEG_2+diff)<<8; 352 } else if(diff>=BOCU1_REACH_NEG_3) { 353 /* three bytes */ 354 diff-=BOCU1_REACH_NEG_2; 355 result=0x03000000; 356 357 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 358 result|=BOCU1_TRAIL_TO_BYTE(m); 359 360 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 361 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 362 363 result|=(BOCU1_START_NEG_3+diff)<<16; 364 } else { 365 /* four bytes */ 366 diff-=BOCU1_REACH_NEG_3; 367 368 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 369 result=BOCU1_TRAIL_TO_BYTE(m); 370 371 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 372 result|=BOCU1_TRAIL_TO_BYTE(m)<<8; 373 374 /* 375 * We know that NEGDIVMOD would deliver 376 * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. 377 * Avoid division and modulo for performance. 378 */ 379 m=diff+BOCU1_TRAIL_COUNT; 380 result|=BOCU1_TRAIL_TO_BYTE(m)<<16; 381 382 result|=BOCU1_MIN<<24; 383 } 384 } 385 return result; 386 } 387 388 389 static void 390 _Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 391 UErrorCode *pErrorCode) { 392 UConverter *cnv; 393 const UChar *source, *sourceLimit; 394 uint8_t *target; 395 int32_t targetCapacity; 396 int32_t *offsets; 397 398 int32_t prev, c, diff; 399 400 int32_t sourceIndex, nextSourceIndex; 401 402 /* set up the local pointers */ 403 cnv=pArgs->converter; 404 source=pArgs->source; 405 sourceLimit=pArgs->sourceLimit; 406 target=(uint8_t *)pArgs->target; 407 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 408 offsets=pArgs->offsets; 409 410 /* get the converter state from UConverter */ 411 c=cnv->fromUChar32; 412 prev=(int32_t)cnv->fromUnicodeStatus; 413 if(prev==0) { 414 prev=BOCU1_ASCII_PREV; 415 } 416 417 /* sourceIndex=-1 if the current character began in the previous buffer */ 418 sourceIndex= c==0 ? 0 : -1; 419 nextSourceIndex=0; 420 421 /* conversion loop */ 422 if(c!=0 && targetCapacity>0) { 423 goto getTrail; 424 } 425 426 fastSingle: 427 /* fast loop for single-byte differences */ 428 /* use only one loop counter variable, targetCapacity, not also source */ 429 diff=(int32_t)(sourceLimit-source); 430 if(targetCapacity>diff) { 431 targetCapacity=diff; 432 } 433 while(targetCapacity>0 && (c=*source)<0x3000) { 434 if(c<=0x20) { 435 if(c!=0x20) { 436 prev=BOCU1_ASCII_PREV; 437 } 438 *target++=(uint8_t)c; 439 *offsets++=nextSourceIndex++; 440 ++source; 441 --targetCapacity; 442 } else { 443 diff=c-prev; 444 if(DIFF_IS_SINGLE(diff)) { 445 prev=BOCU1_SIMPLE_PREV(c); 446 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 447 *offsets++=nextSourceIndex++; 448 ++source; 449 --targetCapacity; 450 } else { 451 break; 452 } 453 } 454 } 455 /* restore real values */ 456 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 457 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 458 459 /* regular loop for all cases */ 460 while(source<sourceLimit) { 461 if(targetCapacity>0) { 462 c=*source++; 463 ++nextSourceIndex; 464 465 if(c<=0x20) { 466 /* 467 * ISO C0 control & space: 468 * Encode directly for MIME compatibility, 469 * and reset state except for space, to not disrupt compression. 470 */ 471 if(c!=0x20) { 472 prev=BOCU1_ASCII_PREV; 473 } 474 *target++=(uint8_t)c; 475 *offsets++=sourceIndex; 476 --targetCapacity; 477 478 sourceIndex=nextSourceIndex; 479 continue; 480 } 481 482 if(U16_IS_LEAD(c)) { 483 getTrail: 484 if(source<sourceLimit) { 485 /* test the following code unit */ 486 UChar trail=*source; 487 if(U16_IS_TRAIL(trail)) { 488 ++source; 489 ++nextSourceIndex; 490 c=U16_GET_SUPPLEMENTARY(c, trail); 491 } 492 } else { 493 /* no more input */ 494 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 495 break; 496 } 497 } 498 499 /* 500 * all other Unicode code points c==U+0021..U+10ffff 501 * are encoded with the difference c-prev 502 * 503 * a new prev is computed from c, 504 * placed in the middle of a 0x80-block (for most small scripts) or 505 * in the middle of the Unihan and Hangul blocks 506 * to statistically minimize the following difference 507 */ 508 diff=c-prev; 509 prev=BOCU1_PREV(c); 510 if(DIFF_IS_SINGLE(diff)) { 511 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 512 *offsets++=sourceIndex; 513 --targetCapacity; 514 sourceIndex=nextSourceIndex; 515 if(c<0x3000) { 516 goto fastSingle; 517 } 518 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 519 /* optimize 2-byte case */ 520 int32_t m; 521 522 if(diff>=0) { 523 diff-=BOCU1_REACH_POS_1+1; 524 m=diff%BOCU1_TRAIL_COUNT; 525 diff/=BOCU1_TRAIL_COUNT; 526 diff+=BOCU1_START_POS_2; 527 } else { 528 diff-=BOCU1_REACH_NEG_1; 529 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 530 diff+=BOCU1_START_NEG_2; 531 } 532 *target++=(uint8_t)diff; 533 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 534 *offsets++=sourceIndex; 535 *offsets++=sourceIndex; 536 targetCapacity-=2; 537 sourceIndex=nextSourceIndex; 538 } else { 539 int32_t length; /* will be 2..4 */ 540 541 diff=packDiff(diff); 542 length=BOCU1_LENGTH_FROM_PACKED(diff); 543 544 /* write the output character bytes from diff and length */ 545 /* from the first if in the loop we know that targetCapacity>0 */ 546 if(length<=targetCapacity) { 547 switch(length) { 548 /* each branch falls through to the next one */ 549 case 4: 550 *target++=(uint8_t)(diff>>24); 551 *offsets++=sourceIndex; 552 case 3: /*fall through*/ 553 *target++=(uint8_t)(diff>>16); 554 *offsets++=sourceIndex; 555 case 2: /*fall through*/ 556 *target++=(uint8_t)(diff>>8); 557 *offsets++=sourceIndex; 558 /* case 1: handled above */ 559 *target++=(uint8_t)diff; 560 *offsets++=sourceIndex; 561 default: 562 /* will never occur */ 563 break; 564 } 565 targetCapacity-=length; 566 sourceIndex=nextSourceIndex; 567 } else { 568 uint8_t *charErrorBuffer; 569 570 /* 571 * We actually do this backwards here: 572 * In order to save an intermediate variable, we output 573 * first to the overflow buffer what does not fit into the 574 * regular target. 575 */ 576 /* we know that 1<=targetCapacity<length<=4 */ 577 length-=targetCapacity; 578 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 579 switch(length) { 580 /* each branch falls through to the next one */ 581 case 3: 582 *charErrorBuffer++=(uint8_t)(diff>>16); 583 case 2: /*fall through*/ 584 *charErrorBuffer++=(uint8_t)(diff>>8); 585 case 1: /*fall through*/ 586 *charErrorBuffer=(uint8_t)diff; 587 default: 588 /* will never occur */ 589 break; 590 } 591 cnv->charErrorBufferLength=(int8_t)length; 592 593 /* now output what fits into the regular target */ 594 diff>>=8*length; /* length was reduced by targetCapacity */ 595 switch(targetCapacity) { 596 /* each branch falls through to the next one */ 597 case 3: 598 *target++=(uint8_t)(diff>>16); 599 *offsets++=sourceIndex; 600 case 2: /*fall through*/ 601 *target++=(uint8_t)(diff>>8); 602 *offsets++=sourceIndex; 603 case 1: /*fall through*/ 604 *target++=(uint8_t)diff; 605 *offsets++=sourceIndex; 606 default: 607 /* will never occur */ 608 break; 609 } 610 611 /* target overflow */ 612 targetCapacity=0; 613 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 614 break; 615 } 616 } 617 } else { 618 /* target is full */ 619 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 620 break; 621 } 622 } 623 624 /* set the converter state back into UConverter */ 625 cnv->fromUChar32= c<0 ? -c : 0; 626 cnv->fromUnicodeStatus=(uint32_t)prev; 627 628 /* write back the updated pointers */ 629 pArgs->source=source; 630 pArgs->target=(char *)target; 631 pArgs->offsets=offsets; 632 } 633 634 /* 635 * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. 636 * If a change is made in the original function, then either 637 * change this function the same way or 638 * re-copy the original function and remove the variables 639 * offsets, sourceIndex, and nextSourceIndex. 640 */ 641 static void 642 _Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, 643 UErrorCode *pErrorCode) { 644 UConverter *cnv; 645 const UChar *source, *sourceLimit; 646 uint8_t *target; 647 int32_t targetCapacity; 648 649 int32_t prev, c, diff; 650 651 /* set up the local pointers */ 652 cnv=pArgs->converter; 653 source=pArgs->source; 654 sourceLimit=pArgs->sourceLimit; 655 target=(uint8_t *)pArgs->target; 656 targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); 657 658 /* get the converter state from UConverter */ 659 c=cnv->fromUChar32; 660 prev=(int32_t)cnv->fromUnicodeStatus; 661 if(prev==0) { 662 prev=BOCU1_ASCII_PREV; 663 } 664 665 /* conversion loop */ 666 if(c!=0 && targetCapacity>0) { 667 goto getTrail; 668 } 669 670 fastSingle: 671 /* fast loop for single-byte differences */ 672 /* use only one loop counter variable, targetCapacity, not also source */ 673 diff=(int32_t)(sourceLimit-source); 674 if(targetCapacity>diff) { 675 targetCapacity=diff; 676 } 677 while(targetCapacity>0 && (c=*source)<0x3000) { 678 if(c<=0x20) { 679 if(c!=0x20) { 680 prev=BOCU1_ASCII_PREV; 681 } 682 *target++=(uint8_t)c; 683 } else { 684 diff=c-prev; 685 if(DIFF_IS_SINGLE(diff)) { 686 prev=BOCU1_SIMPLE_PREV(c); 687 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 688 } else { 689 break; 690 } 691 } 692 ++source; 693 --targetCapacity; 694 } 695 /* restore real values */ 696 targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); 697 698 /* regular loop for all cases */ 699 while(source<sourceLimit) { 700 if(targetCapacity>0) { 701 c=*source++; 702 703 if(c<=0x20) { 704 /* 705 * ISO C0 control & space: 706 * Encode directly for MIME compatibility, 707 * and reset state except for space, to not disrupt compression. 708 */ 709 if(c!=0x20) { 710 prev=BOCU1_ASCII_PREV; 711 } 712 *target++=(uint8_t)c; 713 --targetCapacity; 714 continue; 715 } 716 717 if(U16_IS_LEAD(c)) { 718 getTrail: 719 if(source<sourceLimit) { 720 /* test the following code unit */ 721 UChar trail=*source; 722 if(U16_IS_TRAIL(trail)) { 723 ++source; 724 c=U16_GET_SUPPLEMENTARY(c, trail); 725 } 726 } else { 727 /* no more input */ 728 c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ 729 break; 730 } 731 } 732 733 /* 734 * all other Unicode code points c==U+0021..U+10ffff 735 * are encoded with the difference c-prev 736 * 737 * a new prev is computed from c, 738 * placed in the middle of a 0x80-block (for most small scripts) or 739 * in the middle of the Unihan and Hangul blocks 740 * to statistically minimize the following difference 741 */ 742 diff=c-prev; 743 prev=BOCU1_PREV(c); 744 if(DIFF_IS_SINGLE(diff)) { 745 *target++=(uint8_t)PACK_SINGLE_DIFF(diff); 746 --targetCapacity; 747 if(c<0x3000) { 748 goto fastSingle; 749 } 750 } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { 751 /* optimize 2-byte case */ 752 int32_t m; 753 754 if(diff>=0) { 755 diff-=BOCU1_REACH_POS_1+1; 756 m=diff%BOCU1_TRAIL_COUNT; 757 diff/=BOCU1_TRAIL_COUNT; 758 diff+=BOCU1_START_POS_2; 759 } else { 760 diff-=BOCU1_REACH_NEG_1; 761 NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); 762 diff+=BOCU1_START_NEG_2; 763 } 764 *target++=(uint8_t)diff; 765 *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); 766 targetCapacity-=2; 767 } else { 768 int32_t length; /* will be 2..4 */ 769 770 diff=packDiff(diff); 771 length=BOCU1_LENGTH_FROM_PACKED(diff); 772 773 /* write the output character bytes from diff and length */ 774 /* from the first if in the loop we know that targetCapacity>0 */ 775 if(length<=targetCapacity) { 776 switch(length) { 777 /* each branch falls through to the next one */ 778 case 4: 779 *target++=(uint8_t)(diff>>24); 780 case 3: /*fall through*/ 781 *target++=(uint8_t)(diff>>16); 782 /* case 2: handled above */ 783 *target++=(uint8_t)(diff>>8); 784 /* case 1: handled above */ 785 *target++=(uint8_t)diff; 786 default: 787 /* will never occur */ 788 break; 789 } 790 targetCapacity-=length; 791 } else { 792 uint8_t *charErrorBuffer; 793 794 /* 795 * We actually do this backwards here: 796 * In order to save an intermediate variable, we output 797 * first to the overflow buffer what does not fit into the 798 * regular target. 799 */ 800 /* we know that 1<=targetCapacity<length<=4 */ 801 length-=targetCapacity; 802 charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; 803 switch(length) { 804 /* each branch falls through to the next one */ 805 case 3: 806 *charErrorBuffer++=(uint8_t)(diff>>16); 807 case 2: /*fall through*/ 808 *charErrorBuffer++=(uint8_t)(diff>>8); 809 case 1: /*fall through*/ 810 *charErrorBuffer=(uint8_t)diff; 811 default: 812 /* will never occur */ 813 break; 814 } 815 cnv->charErrorBufferLength=(int8_t)length; 816 817 /* now output what fits into the regular target */ 818 diff>>=8*length; /* length was reduced by targetCapacity */ 819 switch(targetCapacity) { 820 /* each branch falls through to the next one */ 821 case 3: 822 *target++=(uint8_t)(diff>>16); 823 case 2: /*fall through*/ 824 *target++=(uint8_t)(diff>>8); 825 case 1: /*fall through*/ 826 *target++=(uint8_t)diff; 827 default: 828 /* will never occur */ 829 break; 830 } 831 832 /* target overflow */ 833 targetCapacity=0; 834 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 835 break; 836 } 837 } 838 } else { 839 /* target is full */ 840 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 841 break; 842 } 843 } 844 845 /* set the converter state back into UConverter */ 846 cnv->fromUChar32= c<0 ? -c : 0; 847 cnv->fromUnicodeStatus=(uint32_t)prev; 848 849 /* write back the updated pointers */ 850 pArgs->source=source; 851 pArgs->target=(char *)target; 852 } 853 854 /* BOCU-1-to-Unicode conversion functions ----------------------------------- */ 855 856 /** 857 * Function for BOCU-1 decoder; handles multi-byte lead bytes. 858 * 859 * @param b lead byte; 860 * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD 861 * @return (diff<<2)|count 862 */ 863 static inline int32_t 864 decodeBocu1LeadByte(int32_t b) { 865 int32_t diff, count; 866 867 if(b>=BOCU1_START_NEG_2) { 868 /* positive difference */ 869 if(b<BOCU1_START_POS_3) { 870 /* two bytes */ 871 diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 872 count=1; 873 } else if(b<BOCU1_START_POS_4) { 874 /* three bytes */ 875 diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; 876 count=2; 877 } else { 878 /* four bytes */ 879 diff=BOCU1_REACH_POS_3+1; 880 count=3; 881 } 882 } else { 883 /* negative difference */ 884 if(b>=BOCU1_START_NEG_3) { 885 /* two bytes */ 886 diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 887 count=1; 888 } else if(b>BOCU1_MIN) { 889 /* three bytes */ 890 diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; 891 count=2; 892 } else { 893 /* four bytes */ 894 diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; 895 count=3; 896 } 897 } 898 899 /* return the state for decoding the trail byte(s) */ 900 return (diff<<2)|count; 901 } 902 903 /** 904 * Function for BOCU-1 decoder; handles multi-byte trail bytes. 905 * 906 * @param count number of remaining trail bytes including this one 907 * @param b trail byte 908 * @return new delta for diff including b - <0 indicates an error 909 * 910 * @see decodeBocu1 911 */ 912 static inline int32_t 913 decodeBocu1TrailByte(int32_t count, int32_t b) { 914 if(b<=0x20) { 915 /* skip some C0 controls and make the trail byte range contiguous */ 916 b=bocu1ByteToTrail[b]; 917 /* b<0 for an illegal trail byte value will result in return<0 below */ 918 #if BOCU1_MAX_TRAIL<0xff 919 } else if(b>BOCU1_MAX_TRAIL) { 920 return -99; 921 #endif 922 } else { 923 b-=BOCU1_TRAIL_BYTE_OFFSET; 924 } 925 926 /* add trail byte into difference and decrement count */ 927 if(count==1) { 928 return b; 929 } else if(count==2) { 930 return b*BOCU1_TRAIL_COUNT; 931 } else /* count==3 */ { 932 return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); 933 } 934 } 935 936 static void 937 _Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 938 UErrorCode *pErrorCode) { 939 UConverter *cnv; 940 const uint8_t *source, *sourceLimit; 941 UChar *target; 942 const UChar *targetLimit; 943 int32_t *offsets; 944 945 int32_t prev, count, diff, c; 946 947 int8_t byteIndex; 948 uint8_t *bytes; 949 950 int32_t sourceIndex, nextSourceIndex; 951 952 /* set up the local pointers */ 953 cnv=pArgs->converter; 954 source=(const uint8_t *)pArgs->source; 955 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 956 target=pArgs->target; 957 targetLimit=pArgs->targetLimit; 958 offsets=pArgs->offsets; 959 960 /* get the converter state from UConverter */ 961 prev=(int32_t)cnv->toUnicodeStatus; 962 if(prev==0) { 963 prev=BOCU1_ASCII_PREV; 964 } 965 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 966 count=diff&3; 967 diff>>=2; 968 969 byteIndex=cnv->toULength; 970 bytes=cnv->toUBytes; 971 972 /* sourceIndex=-1 if the current character began in the previous buffer */ 973 sourceIndex=byteIndex==0 ? 0 : -1; 974 nextSourceIndex=0; 975 976 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 977 if(count>0 && byteIndex>0 && target<targetLimit) { 978 goto getTrail; 979 } 980 981 fastSingle: 982 /* fast loop for single-byte differences */ 983 /* use count as the only loop counter variable */ 984 diff=(int32_t)(sourceLimit-source); 985 count=(int32_t)(pArgs->targetLimit-target); 986 if(count>diff) { 987 count=diff; 988 } 989 while(count>0) { 990 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 991 c=prev+(c-BOCU1_MIDDLE); 992 if(c<0x3000) { 993 *target++=(UChar)c; 994 *offsets++=nextSourceIndex++; 995 prev=BOCU1_SIMPLE_PREV(c); 996 } else { 997 break; 998 } 999 } else if(c<=0x20) { 1000 if(c!=0x20) { 1001 prev=BOCU1_ASCII_PREV; 1002 } 1003 *target++=(UChar)c; 1004 *offsets++=nextSourceIndex++; 1005 } else { 1006 break; 1007 } 1008 ++source; 1009 --count; 1010 } 1011 sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ 1012 1013 /* decode a sequence of single and lead bytes */ 1014 while(source<sourceLimit) { 1015 if(target>=targetLimit) { 1016 /* target is full */ 1017 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1018 break; 1019 } 1020 1021 ++nextSourceIndex; 1022 c=*source++; 1023 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1024 /* Write a code point directly from a single-byte difference. */ 1025 c=prev+(c-BOCU1_MIDDLE); 1026 if(c<0x3000) { 1027 *target++=(UChar)c; 1028 *offsets++=sourceIndex; 1029 prev=BOCU1_SIMPLE_PREV(c); 1030 sourceIndex=nextSourceIndex; 1031 goto fastSingle; 1032 } 1033 } else if(c<=0x20) { 1034 /* 1035 * Direct-encoded C0 control code or space. 1036 * Reset prev for C0 control codes but not for space. 1037 */ 1038 if(c!=0x20) { 1039 prev=BOCU1_ASCII_PREV; 1040 } 1041 *target++=(UChar)c; 1042 *offsets++=sourceIndex; 1043 sourceIndex=nextSourceIndex; 1044 continue; 1045 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1046 /* Optimize two-byte case. */ 1047 if(c>=BOCU1_MIDDLE) { 1048 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1049 } else { 1050 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1051 } 1052 1053 /* trail byte */ 1054 ++nextSourceIndex; 1055 c=decodeBocu1TrailByte(1, *source++); 1056 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1057 bytes[0]=source[-2]; 1058 bytes[1]=source[-1]; 1059 byteIndex=2; 1060 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1061 break; 1062 } 1063 } else if(c==BOCU1_RESET) { 1064 /* only reset the state, no code point */ 1065 prev=BOCU1_ASCII_PREV; 1066 sourceIndex=nextSourceIndex; 1067 continue; 1068 } else { 1069 /* 1070 * For multi-byte difference lead bytes, set the decoder state 1071 * with the partial difference value from the lead byte and 1072 * with the number of trail bytes. 1073 */ 1074 bytes[0]=(uint8_t)c; 1075 byteIndex=1; 1076 1077 diff=decodeBocu1LeadByte(c); 1078 count=diff&3; 1079 diff>>=2; 1080 getTrail: 1081 for(;;) { 1082 if(source>=sourceLimit) { 1083 goto endloop; 1084 } 1085 ++nextSourceIndex; 1086 c=bytes[byteIndex++]=*source++; 1087 1088 /* trail byte in any position */ 1089 c=decodeBocu1TrailByte(count, c); 1090 if(c<0) { 1091 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1092 goto endloop; 1093 } 1094 1095 diff+=c; 1096 if(--count==0) { 1097 /* final trail byte, deliver a code point */ 1098 byteIndex=0; 1099 c=prev+diff; 1100 if((uint32_t)c>0x10ffff) { 1101 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1102 goto endloop; 1103 } 1104 break; 1105 } 1106 } 1107 } 1108 1109 /* calculate the next prev and output c */ 1110 prev=BOCU1_PREV(c); 1111 if(c<=0xffff) { 1112 *target++=(UChar)c; 1113 *offsets++=sourceIndex; 1114 } else { 1115 /* output surrogate pair */ 1116 *target++=U16_LEAD(c); 1117 if(target<targetLimit) { 1118 *target++=U16_TRAIL(c); 1119 *offsets++=sourceIndex; 1120 *offsets++=sourceIndex; 1121 } else { 1122 /* target overflow */ 1123 *offsets++=sourceIndex; 1124 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1125 cnv->UCharErrorBufferLength=1; 1126 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1127 break; 1128 } 1129 } 1130 sourceIndex=nextSourceIndex; 1131 } 1132 endloop: 1133 1134 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1135 /* set the converter state in UConverter to deal with the next character */ 1136 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1137 cnv->mode=0; 1138 } else { 1139 /* set the converter state back into UConverter */ 1140 cnv->toUnicodeStatus=(uint32_t)prev; 1141 cnv->mode=(diff<<2)|count; 1142 } 1143 cnv->toULength=byteIndex; 1144 1145 /* write back the updated pointers */ 1146 pArgs->source=(const char *)source; 1147 pArgs->target=target; 1148 pArgs->offsets=offsets; 1149 return; 1150 } 1151 1152 /* 1153 * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. 1154 * If a change is made in the original function, then either 1155 * change this function the same way or 1156 * re-copy the original function and remove the variables 1157 * offsets, sourceIndex, and nextSourceIndex. 1158 */ 1159 static void 1160 _Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, 1161 UErrorCode *pErrorCode) { 1162 UConverter *cnv; 1163 const uint8_t *source, *sourceLimit; 1164 UChar *target; 1165 const UChar *targetLimit; 1166 1167 int32_t prev, count, diff, c; 1168 1169 int8_t byteIndex; 1170 uint8_t *bytes; 1171 1172 /* set up the local pointers */ 1173 cnv=pArgs->converter; 1174 source=(const uint8_t *)pArgs->source; 1175 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1176 target=pArgs->target; 1177 targetLimit=pArgs->targetLimit; 1178 1179 /* get the converter state from UConverter */ 1180 prev=(int32_t)cnv->toUnicodeStatus; 1181 if(prev==0) { 1182 prev=BOCU1_ASCII_PREV; 1183 } 1184 diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ 1185 count=diff&3; 1186 diff>>=2; 1187 1188 byteIndex=cnv->toULength; 1189 bytes=cnv->toUBytes; 1190 1191 /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ 1192 if(count>0 && byteIndex>0 && target<targetLimit) { 1193 goto getTrail; 1194 } 1195 1196 fastSingle: 1197 /* fast loop for single-byte differences */ 1198 /* use count as the only loop counter variable */ 1199 diff=(int32_t)(sourceLimit-source); 1200 count=(int32_t)(pArgs->targetLimit-target); 1201 if(count>diff) { 1202 count=diff; 1203 } 1204 while(count>0) { 1205 if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { 1206 c=prev+(c-BOCU1_MIDDLE); 1207 if(c<0x3000) { 1208 *target++=(UChar)c; 1209 prev=BOCU1_SIMPLE_PREV(c); 1210 } else { 1211 break; 1212 } 1213 } else if(c<=0x20) { 1214 if(c!=0x20) { 1215 prev=BOCU1_ASCII_PREV; 1216 } 1217 *target++=(UChar)c; 1218 } else { 1219 break; 1220 } 1221 ++source; 1222 --count; 1223 } 1224 1225 /* decode a sequence of single and lead bytes */ 1226 while(source<sourceLimit) { 1227 if(target>=targetLimit) { 1228 /* target is full */ 1229 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1230 break; 1231 } 1232 1233 c=*source++; 1234 if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { 1235 /* Write a code point directly from a single-byte difference. */ 1236 c=prev+(c-BOCU1_MIDDLE); 1237 if(c<0x3000) { 1238 *target++=(UChar)c; 1239 prev=BOCU1_SIMPLE_PREV(c); 1240 goto fastSingle; 1241 } 1242 } else if(c<=0x20) { 1243 /* 1244 * Direct-encoded C0 control code or space. 1245 * Reset prev for C0 control codes but not for space. 1246 */ 1247 if(c!=0x20) { 1248 prev=BOCU1_ASCII_PREV; 1249 } 1250 *target++=(UChar)c; 1251 continue; 1252 } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { 1253 /* Optimize two-byte case. */ 1254 if(c>=BOCU1_MIDDLE) { 1255 diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; 1256 } else { 1257 diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; 1258 } 1259 1260 /* trail byte */ 1261 c=decodeBocu1TrailByte(1, *source++); 1262 if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { 1263 bytes[0]=source[-2]; 1264 bytes[1]=source[-1]; 1265 byteIndex=2; 1266 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1267 break; 1268 } 1269 } else if(c==BOCU1_RESET) { 1270 /* only reset the state, no code point */ 1271 prev=BOCU1_ASCII_PREV; 1272 continue; 1273 } else { 1274 /* 1275 * For multi-byte difference lead bytes, set the decoder state 1276 * with the partial difference value from the lead byte and 1277 * with the number of trail bytes. 1278 */ 1279 bytes[0]=(uint8_t)c; 1280 byteIndex=1; 1281 1282 diff=decodeBocu1LeadByte(c); 1283 count=diff&3; 1284 diff>>=2; 1285 getTrail: 1286 for(;;) { 1287 if(source>=sourceLimit) { 1288 goto endloop; 1289 } 1290 c=bytes[byteIndex++]=*source++; 1291 1292 /* trail byte in any position */ 1293 c=decodeBocu1TrailByte(count, c); 1294 if(c<0) { 1295 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1296 goto endloop; 1297 } 1298 1299 diff+=c; 1300 if(--count==0) { 1301 /* final trail byte, deliver a code point */ 1302 byteIndex=0; 1303 c=prev+diff; 1304 if((uint32_t)c>0x10ffff) { 1305 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1306 goto endloop; 1307 } 1308 break; 1309 } 1310 } 1311 } 1312 1313 /* calculate the next prev and output c */ 1314 prev=BOCU1_PREV(c); 1315 if(c<=0xffff) { 1316 *target++=(UChar)c; 1317 } else { 1318 /* output surrogate pair */ 1319 *target++=U16_LEAD(c); 1320 if(target<targetLimit) { 1321 *target++=U16_TRAIL(c); 1322 } else { 1323 /* target overflow */ 1324 cnv->UCharErrorBuffer[0]=U16_TRAIL(c); 1325 cnv->UCharErrorBufferLength=1; 1326 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1327 break; 1328 } 1329 } 1330 } 1331 endloop: 1332 1333 if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { 1334 /* set the converter state in UConverter to deal with the next character */ 1335 cnv->toUnicodeStatus=BOCU1_ASCII_PREV; 1336 cnv->mode=0; 1337 } else { 1338 /* set the converter state back into UConverter */ 1339 cnv->toUnicodeStatus=(uint32_t)prev; 1340 cnv->mode=(diff<<2)|count; 1341 } 1342 cnv->toULength=byteIndex; 1343 1344 /* write back the updated pointers */ 1345 pArgs->source=(const char *)source; 1346 pArgs->target=target; 1347 return; 1348 } 1349 1350 /* miscellaneous ------------------------------------------------------------ */ 1351 1352 static const UConverterImpl _Bocu1Impl={ 1353 UCNV_BOCU1, 1354 1355 NULL, 1356 NULL, 1357 1358 NULL, 1359 NULL, 1360 NULL, 1361 1362 _Bocu1ToUnicode, 1363 _Bocu1ToUnicodeWithOffsets, 1364 _Bocu1FromUnicode, 1365 _Bocu1FromUnicodeWithOffsets, 1366 NULL, 1367 1368 NULL, 1369 NULL, 1370 NULL, 1371 NULL, 1372 ucnv_getCompleteUnicodeSet, 1373 1374 NULL, 1375 NULL 1376 }; 1377 1378 static const UConverterStaticData _Bocu1StaticData={ 1379 sizeof(UConverterStaticData), 1380 "BOCU-1", 1381 1214, /* CCSID for BOCU-1 */ 1382 UCNV_IBM, UCNV_BOCU1, 1383 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ 1384 { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ 1385 FALSE, FALSE, 1386 0, 1387 0, 1388 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1389 }; 1390 1391 const UConverterSharedData _Bocu1Data= 1392 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); 1393 1394 #endif 1395