1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv_u16.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2002jul01 12 * created by: Markus W. Scherer 13 * 14 * UTF-16 converter implementation. Used to be in ucnv_utf.c. 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_CONVERSION 20 21 #include "unicode/ucnv.h" 22 #include "ucnv_bld.h" 23 #include "ucnv_cnv.h" 24 #include "cmemory.h" 25 26 enum { 27 UCNV_NEED_TO_WRITE_BOM=1 28 }; 29 30 /* 31 * The UTF-16 toUnicode implementation is also used for the Java-specific 32 * "with BOM" variants of UTF-16BE and UTF-16LE. 33 */ 34 static void 35 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 36 UErrorCode *pErrorCode); 37 38 /* UTF-16BE ----------------------------------------------------------------- */ 39 40 #if U_IS_BIG_ENDIAN 41 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets 42 #else 43 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets 44 #endif 45 46 47 static void 48 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 49 UErrorCode *pErrorCode) { 50 UConverter *cnv; 51 const UChar *source; 52 char *target; 53 int32_t *offsets; 54 55 uint32_t targetCapacity, length, sourceIndex; 56 UChar c, trail; 57 char overflow[4]; 58 59 source=pArgs->source; 60 length=(int32_t)(pArgs->sourceLimit-source); 61 if(length<=0) { 62 /* no input, nothing to do */ 63 return; 64 } 65 66 cnv=pArgs->converter; 67 68 /* write the BOM if necessary */ 69 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 70 static const char bom[]={ (char)0xfe, (char)0xff }; 71 ucnv_fromUWriteBytes(cnv, 72 bom, 2, 73 &pArgs->target, pArgs->targetLimit, 74 &pArgs->offsets, -1, 75 pErrorCode); 76 cnv->fromUnicodeStatus=0; 77 } 78 79 target=pArgs->target; 80 if(target >= pArgs->targetLimit) { 81 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 82 return; 83 } 84 85 targetCapacity=(uint32_t)(pArgs->targetLimit-target); 86 offsets=pArgs->offsets; 87 sourceIndex=0; 88 89 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 90 91 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 92 /* the last buffer ended with a lead surrogate, output the surrogate pair */ 93 ++source; 94 --length; 95 target[0]=(uint8_t)(c>>8); 96 target[1]=(uint8_t)c; 97 target[2]=(uint8_t)(trail>>8); 98 target[3]=(uint8_t)trail; 99 target+=4; 100 targetCapacity-=4; 101 if(offsets!=NULL) { 102 *offsets++=-1; 103 *offsets++=-1; 104 *offsets++=-1; 105 *offsets++=-1; 106 } 107 sourceIndex=1; 108 cnv->fromUChar32=c=0; 109 } 110 111 if(c==0) { 112 /* copy an even number of bytes for complete UChars */ 113 uint32_t count=2*length; 114 if(count>targetCapacity) { 115 count=targetCapacity&~1; 116 } 117 /* count is even */ 118 targetCapacity-=count; 119 count>>=1; 120 length-=count; 121 122 if(offsets==NULL) { 123 while(count>0) { 124 c=*source++; 125 if(U16_IS_SINGLE(c)) { 126 target[0]=(uint8_t)(c>>8); 127 target[1]=(uint8_t)c; 128 target+=2; 129 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 130 ++source; 131 --count; 132 target[0]=(uint8_t)(c>>8); 133 target[1]=(uint8_t)c; 134 target[2]=(uint8_t)(trail>>8); 135 target[3]=(uint8_t)trail; 136 target+=4; 137 } else { 138 break; 139 } 140 --count; 141 } 142 } else { 143 while(count>0) { 144 c=*source++; 145 if(U16_IS_SINGLE(c)) { 146 target[0]=(uint8_t)(c>>8); 147 target[1]=(uint8_t)c; 148 target+=2; 149 *offsets++=sourceIndex; 150 *offsets++=sourceIndex++; 151 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 152 ++source; 153 --count; 154 target[0]=(uint8_t)(c>>8); 155 target[1]=(uint8_t)c; 156 target[2]=(uint8_t)(trail>>8); 157 target[3]=(uint8_t)trail; 158 target+=4; 159 *offsets++=sourceIndex; 160 *offsets++=sourceIndex; 161 *offsets++=sourceIndex; 162 *offsets++=sourceIndex; 163 sourceIndex+=2; 164 } else { 165 break; 166 } 167 --count; 168 } 169 } 170 171 if(count==0) { 172 /* done with the loop for complete UChars */ 173 if(length>0 && targetCapacity>0) { 174 /* 175 * there is more input and some target capacity - 176 * it must be targetCapacity==1 because otherwise 177 * the above would have copied more; 178 * prepare for overflow output 179 */ 180 if(U16_IS_SINGLE(c=*source++)) { 181 overflow[0]=(char)(c>>8); 182 overflow[1]=(char)c; 183 length=2; /* 2 bytes to output */ 184 c=0; 185 /* } else { keep c for surrogate handling, length will be set there */ 186 } 187 } else { 188 length=0; 189 c=0; 190 } 191 } else { 192 /* keep c for surrogate handling, length will be set there */ 193 targetCapacity+=2*count; 194 } 195 } else { 196 length=0; /* from here on, length counts the bytes in overflow[] */ 197 } 198 199 if(c!=0) { 200 /* 201 * c is a surrogate, and 202 * - source or target too short 203 * - or the surrogate is unmatched 204 */ 205 length=0; 206 if(U16_IS_SURROGATE_LEAD(c)) { 207 if(source<pArgs->sourceLimit) { 208 if(U16_IS_TRAIL(trail=*source)) { 209 /* output the surrogate pair, will overflow (see conditions comment above) */ 210 ++source; 211 overflow[0]=(char)(c>>8); 212 overflow[1]=(char)c; 213 overflow[2]=(char)(trail>>8); 214 overflow[3]=(char)trail; 215 length=4; /* 4 bytes to output */ 216 c=0; 217 } else { 218 /* unmatched lead surrogate */ 219 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 220 } 221 } else { 222 /* see if the trail surrogate is in the next buffer */ 223 } 224 } else { 225 /* unmatched trail surrogate */ 226 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 227 } 228 cnv->fromUChar32=c; 229 } 230 231 if(length>0) { 232 /* output length bytes with overflow (length>targetCapacity>0) */ 233 ucnv_fromUWriteBytes(cnv, 234 overflow, length, 235 (char **)&target, pArgs->targetLimit, 236 &offsets, sourceIndex, 237 pErrorCode); 238 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 239 } 240 241 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 242 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 243 } 244 245 /* write back the updated pointers */ 246 pArgs->source=source; 247 pArgs->target=(char *)target; 248 pArgs->offsets=offsets; 249 } 250 251 static void 252 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 253 UErrorCode *pErrorCode) { 254 UConverter *cnv; 255 const uint8_t *source; 256 UChar *target; 257 int32_t *offsets; 258 259 uint32_t targetCapacity, length, count, sourceIndex; 260 UChar c, trail; 261 262 if(pArgs->converter->mode<8) { 263 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 264 return; 265 } 266 267 cnv=pArgs->converter; 268 source=(const uint8_t *)pArgs->source; 269 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 270 if(length<=0 && cnv->toUnicodeStatus==0) { 271 /* no input, nothing to do */ 272 return; 273 } 274 275 target=pArgs->target; 276 if(target >= pArgs->targetLimit) { 277 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 278 return; 279 } 280 281 targetCapacity=(uint32_t)(pArgs->targetLimit-target); 282 offsets=pArgs->offsets; 283 sourceIndex=0; 284 c=0; 285 286 /* complete a partial UChar or pair from the last call */ 287 if(cnv->toUnicodeStatus!=0) { 288 /* 289 * special case: single byte from a previous buffer, 290 * where the byte turned out not to belong to a trail surrogate 291 * and the preceding, unmatched lead surrogate was put into toUBytes[] 292 * for error handling 293 */ 294 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 295 cnv->toULength=1; 296 cnv->toUnicodeStatus=0; 297 } 298 if((count=cnv->toULength)!=0) { 299 uint8_t *p=cnv->toUBytes; 300 do { 301 p[count++]=*source++; 302 ++sourceIndex; 303 --length; 304 if(count==2) { 305 c=((UChar)p[0]<<8)|p[1]; 306 if(U16_IS_SINGLE(c)) { 307 /* output the BMP code point */ 308 *target++=c; 309 if(offsets!=NULL) { 310 *offsets++=-1; 311 } 312 --targetCapacity; 313 count=0; 314 c=0; 315 break; 316 } else if(U16_IS_SURROGATE_LEAD(c)) { 317 /* continue collecting bytes for the trail surrogate */ 318 c=0; /* avoid unnecessary surrogate handling below */ 319 } else { 320 /* fall through to error handling for an unmatched trail surrogate */ 321 break; 322 } 323 } else if(count==4) { 324 c=((UChar)p[0]<<8)|p[1]; 325 trail=((UChar)p[2]<<8)|p[3]; 326 if(U16_IS_TRAIL(trail)) { 327 /* output the surrogate pair */ 328 *target++=c; 329 if(targetCapacity>=2) { 330 *target++=trail; 331 if(offsets!=NULL) { 332 *offsets++=-1; 333 *offsets++=-1; 334 } 335 targetCapacity-=2; 336 } else /* targetCapacity==1 */ { 337 targetCapacity=0; 338 cnv->UCharErrorBuffer[0]=trail; 339 cnv->UCharErrorBufferLength=1; 340 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 341 } 342 count=0; 343 c=0; 344 break; 345 } else { 346 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 347 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 348 349 /* back out reading the code unit after it */ 350 if(((const uint8_t *)pArgs->source-source)>=2) { 351 source-=2; 352 } else { 353 /* 354 * if the trail unit's first byte was in a previous buffer, then 355 * we need to put it into a special place because toUBytes[] will be 356 * used for the lead unit's bytes 357 */ 358 cnv->toUnicodeStatus=0x100|p[2]; 359 --source; 360 } 361 cnv->toULength=2; 362 363 /* write back the updated pointers */ 364 pArgs->source=(const char *)source; 365 pArgs->target=target; 366 pArgs->offsets=offsets; 367 return; 368 } 369 } 370 } while(length>0); 371 cnv->toULength=(int8_t)count; 372 } 373 374 /* copy an even number of bytes for complete UChars */ 375 count=2*targetCapacity; 376 if(count>length) { 377 count=length&~1; 378 } 379 if(c==0 && count>0) { 380 length-=count; 381 count>>=1; 382 targetCapacity-=count; 383 if(offsets==NULL) { 384 do { 385 c=((UChar)source[0]<<8)|source[1]; 386 source+=2; 387 if(U16_IS_SINGLE(c)) { 388 *target++=c; 389 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 390 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 391 ) { 392 source+=2; 393 --count; 394 *target++=c; 395 *target++=trail; 396 } else { 397 break; 398 } 399 } while(--count>0); 400 } else { 401 do { 402 c=((UChar)source[0]<<8)|source[1]; 403 source+=2; 404 if(U16_IS_SINGLE(c)) { 405 *target++=c; 406 *offsets++=sourceIndex; 407 sourceIndex+=2; 408 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 409 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 410 ) { 411 source+=2; 412 --count; 413 *target++=c; 414 *target++=trail; 415 *offsets++=sourceIndex; 416 *offsets++=sourceIndex; 417 sourceIndex+=4; 418 } else { 419 break; 420 } 421 } while(--count>0); 422 } 423 424 if(count==0) { 425 /* done with the loop for complete UChars */ 426 c=0; 427 } else { 428 /* keep c for surrogate handling, trail will be set there */ 429 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 430 targetCapacity+=count; 431 } 432 } 433 434 if(c!=0) { 435 /* 436 * c is a surrogate, and 437 * - source or target too short 438 * - or the surrogate is unmatched 439 */ 440 cnv->toUBytes[0]=(uint8_t)(c>>8); 441 cnv->toUBytes[1]=(uint8_t)c; 442 cnv->toULength=2; 443 444 if(U16_IS_SURROGATE_LEAD(c)) { 445 if(length>=2) { 446 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { 447 /* output the surrogate pair, will overflow (see conditions comment above) */ 448 source+=2; 449 length-=2; 450 *target++=c; 451 if(offsets!=NULL) { 452 *offsets++=sourceIndex; 453 } 454 cnv->UCharErrorBuffer[0]=trail; 455 cnv->UCharErrorBufferLength=1; 456 cnv->toULength=0; 457 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 458 } else { 459 /* unmatched lead surrogate */ 460 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 461 } 462 } else { 463 /* see if the trail surrogate is in the next buffer */ 464 } 465 } else { 466 /* unmatched trail surrogate */ 467 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 468 } 469 } 470 471 if(U_SUCCESS(*pErrorCode)) { 472 /* check for a remaining source byte */ 473 if(length>0) { 474 if(targetCapacity==0) { 475 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 476 } else { 477 /* it must be length==1 because otherwise the above would have copied more */ 478 cnv->toUBytes[cnv->toULength++]=*source++; 479 } 480 } 481 } 482 483 /* write back the updated pointers */ 484 pArgs->source=(const char *)source; 485 pArgs->target=target; 486 pArgs->offsets=offsets; 487 } 488 489 static UChar32 490 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 491 const uint8_t *s, *sourceLimit; 492 UChar32 c; 493 494 if(pArgs->converter->mode<8) { 495 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 496 } 497 498 s=(const uint8_t *)pArgs->source; 499 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 500 501 if(s>=sourceLimit) { 502 /* no input */ 503 *err=U_INDEX_OUTOFBOUNDS_ERROR; 504 return 0xffff; 505 } 506 507 if(s+2>sourceLimit) { 508 /* only one byte: truncated UChar */ 509 pArgs->converter->toUBytes[0]=*s++; 510 pArgs->converter->toULength=1; 511 pArgs->source=(const char *)s; 512 *err = U_TRUNCATED_CHAR_FOUND; 513 return 0xffff; 514 } 515 516 /* get one UChar */ 517 c=((UChar32)*s<<8)|s[1]; 518 s+=2; 519 520 /* check for a surrogate pair */ 521 if(U_IS_SURROGATE(c)) { 522 if(U16_IS_SURROGATE_LEAD(c)) { 523 if(s+2<=sourceLimit) { 524 UChar trail; 525 526 /* get a second UChar and see if it is a trail surrogate */ 527 trail=((UChar)*s<<8)|s[1]; 528 if(U16_IS_TRAIL(trail)) { 529 c=U16_GET_SUPPLEMENTARY(c, trail); 530 s+=2; 531 } else { 532 /* unmatched lead surrogate */ 533 c=-2; 534 } 535 } else { 536 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 537 uint8_t *bytes=pArgs->converter->toUBytes; 538 s-=2; 539 pArgs->converter->toULength=(int8_t)(sourceLimit-s); 540 do { 541 *bytes++=*s++; 542 } while(s<sourceLimit); 543 544 c=0xffff; 545 *err=U_TRUNCATED_CHAR_FOUND; 546 } 547 } else { 548 /* unmatched trail surrogate */ 549 c=-2; 550 } 551 552 if(c<0) { 553 /* write the unmatched surrogate */ 554 uint8_t *bytes=pArgs->converter->toUBytes; 555 pArgs->converter->toULength=2; 556 *bytes=*(s-2); 557 bytes[1]=*(s-1); 558 559 c=0xffff; 560 *err=U_ILLEGAL_CHAR_FOUND; 561 } 562 } 563 564 pArgs->source=(const char *)s; 565 return c; 566 } 567 568 static void 569 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { 570 if(choice<=UCNV_RESET_TO_UNICODE) { 571 /* reset toUnicode state */ 572 if(UCNV_GET_VERSION(cnv)==0) { 573 cnv->mode=8; /* no BOM handling */ 574 } else { 575 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ 576 } 577 } 578 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 579 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ 580 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 581 } 582 } 583 584 static void 585 _UTF16BEOpen(UConverter *cnv, 586 UConverterLoadArgs *pArgs, 587 UErrorCode *pErrorCode) { 588 if(UCNV_GET_VERSION(cnv)<=1) { 589 _UTF16BEReset(cnv, UCNV_RESET_BOTH); 590 } else { 591 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 592 } 593 } 594 595 static const char * 596 _UTF16BEGetName(const UConverter *cnv) { 597 if(UCNV_GET_VERSION(cnv)==0) { 598 return "UTF-16BE"; 599 } else { 600 return "UTF-16BE,version=1"; 601 } 602 } 603 604 static const UConverterImpl _UTF16BEImpl={ 605 UCNV_UTF16_BigEndian, 606 607 NULL, 608 NULL, 609 610 _UTF16BEOpen, 611 NULL, 612 _UTF16BEReset, 613 614 _UTF16BEToUnicodeWithOffsets, 615 _UTF16BEToUnicodeWithOffsets, 616 _UTF16BEFromUnicodeWithOffsets, 617 _UTF16BEFromUnicodeWithOffsets, 618 _UTF16BEGetNextUChar, 619 620 NULL, 621 _UTF16BEGetName, 622 NULL, 623 NULL, 624 ucnv_getNonSurrogateUnicodeSet 625 }; 626 627 static const UConverterStaticData _UTF16BEStaticData={ 628 sizeof(UConverterStaticData), 629 "UTF-16BE", 630 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 4, 631 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, 632 0, 633 0, 634 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 635 }; 636 637 638 const UConverterSharedData _UTF16BEData={ 639 sizeof(UConverterSharedData), ~((uint32_t) 0), 640 NULL, NULL, &_UTF16BEStaticData, FALSE, &_UTF16BEImpl, 641 0 642 }; 643 644 /* UTF-16LE ----------------------------------------------------------------- */ 645 646 static void 647 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 648 UErrorCode *pErrorCode) { 649 UConverter *cnv; 650 const UChar *source; 651 char *target; 652 int32_t *offsets; 653 654 uint32_t targetCapacity, length, sourceIndex; 655 UChar c, trail; 656 char overflow[4]; 657 658 source=pArgs->source; 659 length=(int32_t)(pArgs->sourceLimit-source); 660 if(length<=0) { 661 /* no input, nothing to do */ 662 return; 663 } 664 665 cnv=pArgs->converter; 666 667 /* write the BOM if necessary */ 668 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 669 static const char bom[]={ (char)0xff, (char)0xfe }; 670 ucnv_fromUWriteBytes(cnv, 671 bom, 2, 672 &pArgs->target, pArgs->targetLimit, 673 &pArgs->offsets, -1, 674 pErrorCode); 675 cnv->fromUnicodeStatus=0; 676 } 677 678 target=pArgs->target; 679 if(target >= pArgs->targetLimit) { 680 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 681 return; 682 } 683 684 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 685 offsets=pArgs->offsets; 686 sourceIndex=0; 687 688 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 689 690 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 691 /* the last buffer ended with a lead surrogate, output the surrogate pair */ 692 ++source; 693 --length; 694 target[0]=(uint8_t)c; 695 target[1]=(uint8_t)(c>>8); 696 target[2]=(uint8_t)trail; 697 target[3]=(uint8_t)(trail>>8); 698 target+=4; 699 targetCapacity-=4; 700 if(offsets!=NULL) { 701 *offsets++=-1; 702 *offsets++=-1; 703 *offsets++=-1; 704 *offsets++=-1; 705 } 706 sourceIndex=1; 707 cnv->fromUChar32=c=0; 708 } 709 710 if(c==0) { 711 /* copy an even number of bytes for complete UChars */ 712 uint32_t count=2*length; 713 if(count>targetCapacity) { 714 count=targetCapacity&~1; 715 } 716 /* count is even */ 717 targetCapacity-=count; 718 count>>=1; 719 length-=count; 720 721 if(offsets==NULL) { 722 while(count>0) { 723 c=*source++; 724 if(U16_IS_SINGLE(c)) { 725 target[0]=(uint8_t)c; 726 target[1]=(uint8_t)(c>>8); 727 target+=2; 728 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 729 ++source; 730 --count; 731 target[0]=(uint8_t)c; 732 target[1]=(uint8_t)(c>>8); 733 target[2]=(uint8_t)trail; 734 target[3]=(uint8_t)(trail>>8); 735 target+=4; 736 } else { 737 break; 738 } 739 --count; 740 } 741 } else { 742 while(count>0) { 743 c=*source++; 744 if(U16_IS_SINGLE(c)) { 745 target[0]=(uint8_t)c; 746 target[1]=(uint8_t)(c>>8); 747 target+=2; 748 *offsets++=sourceIndex; 749 *offsets++=sourceIndex++; 750 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 751 ++source; 752 --count; 753 target[0]=(uint8_t)c; 754 target[1]=(uint8_t)(c>>8); 755 target[2]=(uint8_t)trail; 756 target[3]=(uint8_t)(trail>>8); 757 target+=4; 758 *offsets++=sourceIndex; 759 *offsets++=sourceIndex; 760 *offsets++=sourceIndex; 761 *offsets++=sourceIndex; 762 sourceIndex+=2; 763 } else { 764 break; 765 } 766 --count; 767 } 768 } 769 770 if(count==0) { 771 /* done with the loop for complete UChars */ 772 if(length>0 && targetCapacity>0) { 773 /* 774 * there is more input and some target capacity - 775 * it must be targetCapacity==1 because otherwise 776 * the above would have copied more; 777 * prepare for overflow output 778 */ 779 if(U16_IS_SINGLE(c=*source++)) { 780 overflow[0]=(char)c; 781 overflow[1]=(char)(c>>8); 782 length=2; /* 2 bytes to output */ 783 c=0; 784 /* } else { keep c for surrogate handling, length will be set there */ 785 } 786 } else { 787 length=0; 788 c=0; 789 } 790 } else { 791 /* keep c for surrogate handling, length will be set there */ 792 targetCapacity+=2*count; 793 } 794 } else { 795 length=0; /* from here on, length counts the bytes in overflow[] */ 796 } 797 798 if(c!=0) { 799 /* 800 * c is a surrogate, and 801 * - source or target too short 802 * - or the surrogate is unmatched 803 */ 804 length=0; 805 if(U16_IS_SURROGATE_LEAD(c)) { 806 if(source<pArgs->sourceLimit) { 807 if(U16_IS_TRAIL(trail=*source)) { 808 /* output the surrogate pair, will overflow (see conditions comment above) */ 809 ++source; 810 overflow[0]=(char)c; 811 overflow[1]=(char)(c>>8); 812 overflow[2]=(char)trail; 813 overflow[3]=(char)(trail>>8); 814 length=4; /* 4 bytes to output */ 815 c=0; 816 } else { 817 /* unmatched lead surrogate */ 818 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 819 } 820 } else { 821 /* see if the trail surrogate is in the next buffer */ 822 } 823 } else { 824 /* unmatched trail surrogate */ 825 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 826 } 827 cnv->fromUChar32=c; 828 } 829 830 if(length>0) { 831 /* output length bytes with overflow (length>targetCapacity>0) */ 832 ucnv_fromUWriteBytes(cnv, 833 overflow, length, 834 &target, pArgs->targetLimit, 835 &offsets, sourceIndex, 836 pErrorCode); 837 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 838 } 839 840 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 841 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 842 } 843 844 /* write back the updated pointers */ 845 pArgs->source=source; 846 pArgs->target=target; 847 pArgs->offsets=offsets; 848 } 849 850 static void 851 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 852 UErrorCode *pErrorCode) { 853 UConverter *cnv; 854 const uint8_t *source; 855 UChar *target; 856 int32_t *offsets; 857 858 uint32_t targetCapacity, length, count, sourceIndex; 859 UChar c, trail; 860 861 if(pArgs->converter->mode<8) { 862 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 863 return; 864 } 865 866 cnv=pArgs->converter; 867 source=(const uint8_t *)pArgs->source; 868 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 869 if(length<=0 && cnv->toUnicodeStatus==0) { 870 /* no input, nothing to do */ 871 return; 872 } 873 874 target=pArgs->target; 875 if(target >= pArgs->targetLimit) { 876 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 877 return; 878 } 879 880 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 881 offsets=pArgs->offsets; 882 sourceIndex=0; 883 c=0; 884 885 /* complete a partial UChar or pair from the last call */ 886 if(cnv->toUnicodeStatus!=0) { 887 /* 888 * special case: single byte from a previous buffer, 889 * where the byte turned out not to belong to a trail surrogate 890 * and the preceding, unmatched lead surrogate was put into toUBytes[] 891 * for error handling 892 */ 893 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 894 cnv->toULength=1; 895 cnv->toUnicodeStatus=0; 896 } 897 if((count=cnv->toULength)!=0) { 898 uint8_t *p=cnv->toUBytes; 899 do { 900 p[count++]=*source++; 901 ++sourceIndex; 902 --length; 903 if(count==2) { 904 c=((UChar)p[1]<<8)|p[0]; 905 if(U16_IS_SINGLE(c)) { 906 /* output the BMP code point */ 907 *target++=c; 908 if(offsets!=NULL) { 909 *offsets++=-1; 910 } 911 --targetCapacity; 912 count=0; 913 c=0; 914 break; 915 } else if(U16_IS_SURROGATE_LEAD(c)) { 916 /* continue collecting bytes for the trail surrogate */ 917 c=0; /* avoid unnecessary surrogate handling below */ 918 } else { 919 /* fall through to error handling for an unmatched trail surrogate */ 920 break; 921 } 922 } else if(count==4) { 923 c=((UChar)p[1]<<8)|p[0]; 924 trail=((UChar)p[3]<<8)|p[2]; 925 if(U16_IS_TRAIL(trail)) { 926 /* output the surrogate pair */ 927 *target++=c; 928 if(targetCapacity>=2) { 929 *target++=trail; 930 if(offsets!=NULL) { 931 *offsets++=-1; 932 *offsets++=-1; 933 } 934 targetCapacity-=2; 935 } else /* targetCapacity==1 */ { 936 targetCapacity=0; 937 cnv->UCharErrorBuffer[0]=trail; 938 cnv->UCharErrorBufferLength=1; 939 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 940 } 941 count=0; 942 c=0; 943 break; 944 } else { 945 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 946 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 947 948 /* back out reading the code unit after it */ 949 if(((const uint8_t *)pArgs->source-source)>=2) { 950 source-=2; 951 } else { 952 /* 953 * if the trail unit's first byte was in a previous buffer, then 954 * we need to put it into a special place because toUBytes[] will be 955 * used for the lead unit's bytes 956 */ 957 cnv->toUnicodeStatus=0x100|p[2]; 958 --source; 959 } 960 cnv->toULength=2; 961 962 /* write back the updated pointers */ 963 pArgs->source=(const char *)source; 964 pArgs->target=target; 965 pArgs->offsets=offsets; 966 return; 967 } 968 } 969 } while(length>0); 970 cnv->toULength=(int8_t)count; 971 } 972 973 /* copy an even number of bytes for complete UChars */ 974 count=2*targetCapacity; 975 if(count>length) { 976 count=length&~1; 977 } 978 if(c==0 && count>0) { 979 length-=count; 980 count>>=1; 981 targetCapacity-=count; 982 if(offsets==NULL) { 983 do { 984 c=((UChar)source[1]<<8)|source[0]; 985 source+=2; 986 if(U16_IS_SINGLE(c)) { 987 *target++=c; 988 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 989 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 990 ) { 991 source+=2; 992 --count; 993 *target++=c; 994 *target++=trail; 995 } else { 996 break; 997 } 998 } while(--count>0); 999 } else { 1000 do { 1001 c=((UChar)source[1]<<8)|source[0]; 1002 source+=2; 1003 if(U16_IS_SINGLE(c)) { 1004 *target++=c; 1005 *offsets++=sourceIndex; 1006 sourceIndex+=2; 1007 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 1008 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 1009 ) { 1010 source+=2; 1011 --count; 1012 *target++=c; 1013 *target++=trail; 1014 *offsets++=sourceIndex; 1015 *offsets++=sourceIndex; 1016 sourceIndex+=4; 1017 } else { 1018 break; 1019 } 1020 } while(--count>0); 1021 } 1022 1023 if(count==0) { 1024 /* done with the loop for complete UChars */ 1025 c=0; 1026 } else { 1027 /* keep c for surrogate handling, trail will be set there */ 1028 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 1029 targetCapacity+=count; 1030 } 1031 } 1032 1033 if(c!=0) { 1034 /* 1035 * c is a surrogate, and 1036 * - source or target too short 1037 * - or the surrogate is unmatched 1038 */ 1039 cnv->toUBytes[0]=(uint8_t)c; 1040 cnv->toUBytes[1]=(uint8_t)(c>>8); 1041 cnv->toULength=2; 1042 1043 if(U16_IS_SURROGATE_LEAD(c)) { 1044 if(length>=2) { 1045 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { 1046 /* output the surrogate pair, will overflow (see conditions comment above) */ 1047 source+=2; 1048 length-=2; 1049 *target++=c; 1050 if(offsets!=NULL) { 1051 *offsets++=sourceIndex; 1052 } 1053 cnv->UCharErrorBuffer[0]=trail; 1054 cnv->UCharErrorBufferLength=1; 1055 cnv->toULength=0; 1056 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1057 } else { 1058 /* unmatched lead surrogate */ 1059 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1060 } 1061 } else { 1062 /* see if the trail surrogate is in the next buffer */ 1063 } 1064 } else { 1065 /* unmatched trail surrogate */ 1066 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1067 } 1068 } 1069 1070 if(U_SUCCESS(*pErrorCode)) { 1071 /* check for a remaining source byte */ 1072 if(length>0) { 1073 if(targetCapacity==0) { 1074 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1075 } else { 1076 /* it must be length==1 because otherwise the above would have copied more */ 1077 cnv->toUBytes[cnv->toULength++]=*source++; 1078 } 1079 } 1080 } 1081 1082 /* write back the updated pointers */ 1083 pArgs->source=(const char *)source; 1084 pArgs->target=target; 1085 pArgs->offsets=offsets; 1086 } 1087 1088 static UChar32 1089 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 1090 const uint8_t *s, *sourceLimit; 1091 UChar32 c; 1092 1093 if(pArgs->converter->mode<8) { 1094 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1095 } 1096 1097 s=(const uint8_t *)pArgs->source; 1098 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1099 1100 if(s>=sourceLimit) { 1101 /* no input */ 1102 *err=U_INDEX_OUTOFBOUNDS_ERROR; 1103 return 0xffff; 1104 } 1105 1106 if(s+2>sourceLimit) { 1107 /* only one byte: truncated UChar */ 1108 pArgs->converter->toUBytes[0]=*s++; 1109 pArgs->converter->toULength=1; 1110 pArgs->source=(const char *)s; 1111 *err = U_TRUNCATED_CHAR_FOUND; 1112 return 0xffff; 1113 } 1114 1115 /* get one UChar */ 1116 c=((UChar32)s[1]<<8)|*s; 1117 s+=2; 1118 1119 /* check for a surrogate pair */ 1120 if(U_IS_SURROGATE(c)) { 1121 if(U16_IS_SURROGATE_LEAD(c)) { 1122 if(s+2<=sourceLimit) { 1123 UChar trail; 1124 1125 /* get a second UChar and see if it is a trail surrogate */ 1126 trail=((UChar)s[1]<<8)|*s; 1127 if(U16_IS_TRAIL(trail)) { 1128 c=U16_GET_SUPPLEMENTARY(c, trail); 1129 s+=2; 1130 } else { 1131 /* unmatched lead surrogate */ 1132 c=-2; 1133 } 1134 } else { 1135 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 1136 uint8_t *bytes=pArgs->converter->toUBytes; 1137 s-=2; 1138 pArgs->converter->toULength=(int8_t)(sourceLimit-s); 1139 do { 1140 *bytes++=*s++; 1141 } while(s<sourceLimit); 1142 1143 c=0xffff; 1144 *err=U_TRUNCATED_CHAR_FOUND; 1145 } 1146 } else { 1147 /* unmatched trail surrogate */ 1148 c=-2; 1149 } 1150 1151 if(c<0) { 1152 /* write the unmatched surrogate */ 1153 uint8_t *bytes=pArgs->converter->toUBytes; 1154 pArgs->converter->toULength=2; 1155 *bytes=*(s-2); 1156 bytes[1]=*(s-1); 1157 1158 c=0xffff; 1159 *err=U_ILLEGAL_CHAR_FOUND; 1160 } 1161 } 1162 1163 pArgs->source=(const char *)s; 1164 return c; 1165 } 1166 1167 static void 1168 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { 1169 if(choice<=UCNV_RESET_TO_UNICODE) { 1170 /* reset toUnicode state */ 1171 if(UCNV_GET_VERSION(cnv)==0) { 1172 cnv->mode=8; /* no BOM handling */ 1173 } else { 1174 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ 1175 } 1176 } 1177 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 1178 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ 1179 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1180 } 1181 } 1182 1183 static void 1184 _UTF16LEOpen(UConverter *cnv, 1185 UConverterLoadArgs *pArgs, 1186 UErrorCode *pErrorCode) { 1187 if(UCNV_GET_VERSION(cnv)<=1) { 1188 _UTF16LEReset(cnv, UCNV_RESET_BOTH); 1189 } else { 1190 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1191 } 1192 } 1193 1194 static const char * 1195 _UTF16LEGetName(const UConverter *cnv) { 1196 if(UCNV_GET_VERSION(cnv)==0) { 1197 return "UTF-16LE"; 1198 } else { 1199 return "UTF-16LE,version=1"; 1200 } 1201 } 1202 1203 static const UConverterImpl _UTF16LEImpl={ 1204 UCNV_UTF16_LittleEndian, 1205 1206 NULL, 1207 NULL, 1208 1209 _UTF16LEOpen, 1210 NULL, 1211 _UTF16LEReset, 1212 1213 _UTF16LEToUnicodeWithOffsets, 1214 _UTF16LEToUnicodeWithOffsets, 1215 _UTF16LEFromUnicodeWithOffsets, 1216 _UTF16LEFromUnicodeWithOffsets, 1217 _UTF16LEGetNextUChar, 1218 1219 NULL, 1220 _UTF16LEGetName, 1221 NULL, 1222 NULL, 1223 ucnv_getNonSurrogateUnicodeSet 1224 }; 1225 1226 1227 static const UConverterStaticData _UTF16LEStaticData={ 1228 sizeof(UConverterStaticData), 1229 "UTF-16LE", 1230 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 4, 1231 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, 1232 0, 1233 0, 1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1235 }; 1236 1237 1238 const UConverterSharedData _UTF16LEData={ 1239 sizeof(UConverterSharedData), ~((uint32_t) 0), 1240 NULL, NULL, &_UTF16LEStaticData, FALSE, &_UTF16LEImpl, 1241 0 1242 }; 1243 1244 /* UTF-16 (Detect BOM) ------------------------------------------------------ */ 1245 1246 /* 1247 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE 1248 * accordingly. 1249 * This is a simpler version of the UTF-32 converter, with 1250 * fewer states for shorter BOMs. 1251 * 1252 * State values: 1253 * 0 initial state 1254 * 1 saw first byte 1255 * 2..5 - 1256 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 1257 * 8 UTF-16BE mode 1258 * 9 UTF-16LE mode 1259 * 1260 * During detection: state==number of initial bytes seen so far. 1261 * 1262 * On output, emit U+FEFF as the first code point. 1263 * 1264 * Variants: 1265 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. 1266 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and 1267 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. 1268 */ 1269 1270 static void 1271 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { 1272 if(choice<=UCNV_RESET_TO_UNICODE) { 1273 /* reset toUnicode: state=0 */ 1274 cnv->mode=0; 1275 } 1276 if(choice!=UCNV_RESET_TO_UNICODE) { 1277 /* reset fromUnicode: prepare to output the UTF-16PE BOM */ 1278 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1279 } 1280 } 1281 1282 static void 1283 _UTF16Open(UConverter *cnv, 1284 UConverterLoadArgs *pArgs, 1285 UErrorCode *pErrorCode) { 1286 if(UCNV_GET_VERSION(cnv)<=1) { 1287 _UTF16Reset(cnv, UCNV_RESET_BOTH); 1288 } else { 1289 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1290 } 1291 } 1292 1293 static const char * 1294 _UTF16GetName(const UConverter *cnv) { 1295 if(UCNV_GET_VERSION(cnv)==0) { 1296 return "UTF-16"; 1297 } else { 1298 return "UTF-16,version=1"; 1299 } 1300 } 1301 1302 const UConverterSharedData _UTF16Data; 1303 1304 #define IS_UTF16BE(cnv) ((cnv)->sharedData==&_UTF16BEData) 1305 #define IS_UTF16LE(cnv) ((cnv)->sharedData==&_UTF16LEData) 1306 #define IS_UTF16(cnv) ((cnv)->sharedData==&_UTF16Data) 1307 1308 static void 1309 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1310 UErrorCode *pErrorCode) { 1311 UConverter *cnv=pArgs->converter; 1312 const char *source=pArgs->source; 1313 const char *sourceLimit=pArgs->sourceLimit; 1314 int32_t *offsets=pArgs->offsets; 1315 1316 int32_t state, offsetDelta; 1317 uint8_t b; 1318 1319 state=cnv->mode; 1320 1321 /* 1322 * If we detect a BOM in this buffer, then we must add the BOM size to the 1323 * offsets because the actual converter function will not see and count the BOM. 1324 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1325 */ 1326 offsetDelta=0; 1327 1328 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1329 switch(state) { 1330 case 0: 1331 cnv->toUBytes[0]=(uint8_t)*source++; 1332 cnv->toULength=1; 1333 state=1; 1334 break; 1335 case 1: 1336 /* 1337 * Only inside this switch case can the state variable 1338 * temporarily take two additional values: 1339 * 6: BOM error, continue with BE 1340 * 7: BOM error, continue with LE 1341 */ 1342 b=*source; 1343 if(cnv->toUBytes[0]==0xfe && b==0xff) { 1344 if(IS_UTF16LE(cnv)) { 1345 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ 1346 } else { 1347 state=8; /* detect UTF-16BE */ 1348 } 1349 } else if(cnv->toUBytes[0]==0xff && b==0xfe) { 1350 if(IS_UTF16BE(cnv)) { 1351 state=6; /* illegal reverse BOM for Java "UnicodeBig" */ 1352 } else { 1353 state=9; /* detect UTF-16LE */ 1354 } 1355 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { 1356 state=6; /* illegal missing BOM for Java "Unicode" */ 1357 } 1358 if(state>=8) { 1359 /* BOM detected, consume it */ 1360 ++source; 1361 cnv->toULength=0; 1362 offsetDelta=(int32_t)(source-pArgs->source); 1363 } else if(state<6) { 1364 /* ok: no BOM, and not a reverse BOM */ 1365 if(source!=pArgs->source) { 1366 /* reset the source for a correct first offset */ 1367 source=pArgs->source; 1368 cnv->toULength=0; 1369 } 1370 if(IS_UTF16LE(cnv)) { 1371 /* Make Java "UnicodeLittle" default to LE. */ 1372 state=9; 1373 } else { 1374 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ 1375 state=8; 1376 } 1377 } else { 1378 /* 1379 * error: missing BOM, or reverse BOM 1380 * UTF-16,version=1: Java-specific "Unicode" requires a BOM. 1381 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. 1382 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. 1383 */ 1384 /* report the non-BOM or reverse BOM as an illegal sequence */ 1385 cnv->toUBytes[1]=b; 1386 cnv->toULength=2; 1387 pArgs->source=source+1; 1388 /* continue with conversion if the callback resets the error */ 1389 /* 1390 * Make Java "Unicode" default to BE like standard UTF-16. 1391 * Make Java "UnicodeBig" and "UnicodeLittle" default 1392 * to their normal endiannesses. 1393 */ 1394 cnv->mode=state+2; 1395 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; 1396 return; 1397 } 1398 /* convert the rest of the stream */ 1399 cnv->mode=state; 1400 continue; 1401 case 8: 1402 /* call UTF-16BE */ 1403 pArgs->source=source; 1404 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1405 source=pArgs->source; 1406 break; 1407 case 9: 1408 /* call UTF-16LE */ 1409 pArgs->source=source; 1410 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1411 source=pArgs->source; 1412 break; 1413 default: 1414 break; /* does not occur */ 1415 } 1416 } 1417 1418 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1419 if(offsets!=NULL && offsetDelta!=0) { 1420 int32_t *offsetsLimit=pArgs->offsets; 1421 while(offsets<offsetsLimit) { 1422 *offsets++ += offsetDelta; 1423 } 1424 } 1425 1426 pArgs->source=source; 1427 1428 if(source==sourceLimit && pArgs->flush) { 1429 /* handle truncated input */ 1430 switch(state) { 1431 case 0: 1432 break; /* no input at all, nothing to do */ 1433 case 8: 1434 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1435 break; 1436 case 9: 1437 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1438 break; 1439 default: 1440 /* 0<state<8: framework will report truncation, nothing to do here */ 1441 break; 1442 } 1443 } 1444 1445 cnv->mode=state; 1446 } 1447 1448 static UChar32 1449 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, 1450 UErrorCode *pErrorCode) { 1451 switch(pArgs->converter->mode) { 1452 case 8: 1453 return _UTF16BEGetNextUChar(pArgs, pErrorCode); 1454 case 9: 1455 return _UTF16LEGetNextUChar(pArgs, pErrorCode); 1456 default: 1457 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1458 } 1459 } 1460 1461 static const UConverterImpl _UTF16Impl = { 1462 UCNV_UTF16, 1463 1464 NULL, 1465 NULL, 1466 1467 _UTF16Open, 1468 NULL, 1469 _UTF16Reset, 1470 1471 _UTF16ToUnicodeWithOffsets, 1472 _UTF16ToUnicodeWithOffsets, 1473 _UTF16PEFromUnicodeWithOffsets, 1474 _UTF16PEFromUnicodeWithOffsets, 1475 _UTF16GetNextUChar, 1476 1477 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1478 _UTF16GetName, 1479 NULL, 1480 NULL, 1481 ucnv_getNonSurrogateUnicodeSet 1482 }; 1483 1484 static const UConverterStaticData _UTF16StaticData = { 1485 sizeof(UConverterStaticData), 1486 "UTF-16", 1487 1204, /* CCSID for BOM sensitive UTF-16 */ 1488 UCNV_IBM, UCNV_UTF16, 2, 4, 1489 #if U_IS_BIG_ENDIAN 1490 { 0xff, 0xfd, 0, 0 }, 2, 1491 #else 1492 { 0xfd, 0xff, 0, 0 }, 2, 1493 #endif 1494 FALSE, FALSE, 1495 0, 1496 0, 1497 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1498 }; 1499 1500 const UConverterSharedData _UTF16Data = { 1501 sizeof(UConverterSharedData), ~((uint32_t) 0), 1502 NULL, NULL, &_UTF16StaticData, FALSE, &_UTF16Impl, 1503 0 1504 }; 1505 1506 #endif 1507