1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv_u16.c 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jul01 14 * created by: Markus W. Scherer 15 * 16 * UTF-16 converter implementation. Used to be in ucnv_utf.c. 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_CONVERSION 22 23 #include "unicode/ucnv.h" 24 #include "unicode/uversion.h" 25 #include "ucnv_bld.h" 26 #include "ucnv_cnv.h" 27 #include "cmemory.h" 28 29 enum { 30 UCNV_NEED_TO_WRITE_BOM=1 31 }; 32 33 U_CDECL_BEGIN 34 /* 35 * The UTF-16 toUnicode implementation is also used for the Java-specific 36 * "with BOM" variants of UTF-16BE and UTF-16LE. 37 */ 38 static void U_CALLCONV 39 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 40 UErrorCode *pErrorCode); 41 42 /* UTF-16BE ----------------------------------------------------------------- */ 43 44 #if U_IS_BIG_ENDIAN 45 # define _UTF16PEFromUnicodeWithOffsets _UTF16BEFromUnicodeWithOffsets 46 #else 47 # define _UTF16PEFromUnicodeWithOffsets _UTF16LEFromUnicodeWithOffsets 48 #endif 49 50 51 static void U_CALLCONV 52 _UTF16BEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 53 UErrorCode *pErrorCode) { 54 UConverter *cnv; 55 const UChar *source; 56 char *target; 57 int32_t *offsets; 58 59 uint32_t targetCapacity, length, sourceIndex; 60 UChar c, trail; 61 char overflow[4]; 62 63 source=pArgs->source; 64 length=(int32_t)(pArgs->sourceLimit-source); 65 if(length<=0) { 66 /* no input, nothing to do */ 67 return; 68 } 69 70 cnv=pArgs->converter; 71 72 /* write the BOM if necessary */ 73 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 74 static const char bom[]={ (char)0xfe, (char)0xff }; 75 ucnv_fromUWriteBytes(cnv, 76 bom, 2, 77 &pArgs->target, pArgs->targetLimit, 78 &pArgs->offsets, -1, 79 pErrorCode); 80 cnv->fromUnicodeStatus=0; 81 } 82 83 target=pArgs->target; 84 if(target >= pArgs->targetLimit) { 85 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 86 return; 87 } 88 89 targetCapacity=(uint32_t)(pArgs->targetLimit-target); 90 offsets=pArgs->offsets; 91 sourceIndex=0; 92 93 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 94 95 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 96 /* the last buffer ended with a lead surrogate, output the surrogate pair */ 97 ++source; 98 --length; 99 target[0]=(uint8_t)(c>>8); 100 target[1]=(uint8_t)c; 101 target[2]=(uint8_t)(trail>>8); 102 target[3]=(uint8_t)trail; 103 target+=4; 104 targetCapacity-=4; 105 if(offsets!=NULL) { 106 *offsets++=-1; 107 *offsets++=-1; 108 *offsets++=-1; 109 *offsets++=-1; 110 } 111 sourceIndex=1; 112 cnv->fromUChar32=c=0; 113 } 114 115 if(c==0) { 116 /* copy an even number of bytes for complete UChars */ 117 uint32_t count=2*length; 118 if(count>targetCapacity) { 119 count=targetCapacity&~1; 120 } 121 /* count is even */ 122 targetCapacity-=count; 123 count>>=1; 124 length-=count; 125 126 if(offsets==NULL) { 127 while(count>0) { 128 c=*source++; 129 if(U16_IS_SINGLE(c)) { 130 target[0]=(uint8_t)(c>>8); 131 target[1]=(uint8_t)c; 132 target+=2; 133 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 134 ++source; 135 --count; 136 target[0]=(uint8_t)(c>>8); 137 target[1]=(uint8_t)c; 138 target[2]=(uint8_t)(trail>>8); 139 target[3]=(uint8_t)trail; 140 target+=4; 141 } else { 142 break; 143 } 144 --count; 145 } 146 } else { 147 while(count>0) { 148 c=*source++; 149 if(U16_IS_SINGLE(c)) { 150 target[0]=(uint8_t)(c>>8); 151 target[1]=(uint8_t)c; 152 target+=2; 153 *offsets++=sourceIndex; 154 *offsets++=sourceIndex++; 155 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 156 ++source; 157 --count; 158 target[0]=(uint8_t)(c>>8); 159 target[1]=(uint8_t)c; 160 target[2]=(uint8_t)(trail>>8); 161 target[3]=(uint8_t)trail; 162 target+=4; 163 *offsets++=sourceIndex; 164 *offsets++=sourceIndex; 165 *offsets++=sourceIndex; 166 *offsets++=sourceIndex; 167 sourceIndex+=2; 168 } else { 169 break; 170 } 171 --count; 172 } 173 } 174 175 if(count==0) { 176 /* done with the loop for complete UChars */ 177 if(length>0 && targetCapacity>0) { 178 /* 179 * there is more input and some target capacity - 180 * it must be targetCapacity==1 because otherwise 181 * the above would have copied more; 182 * prepare for overflow output 183 */ 184 if(U16_IS_SINGLE(c=*source++)) { 185 overflow[0]=(char)(c>>8); 186 overflow[1]=(char)c; 187 length=2; /* 2 bytes to output */ 188 c=0; 189 /* } else { keep c for surrogate handling, length will be set there */ 190 } 191 } else { 192 length=0; 193 c=0; 194 } 195 } else { 196 /* keep c for surrogate handling, length will be set there */ 197 targetCapacity+=2*count; 198 } 199 } else { 200 length=0; /* from here on, length counts the bytes in overflow[] */ 201 } 202 203 if(c!=0) { 204 /* 205 * c is a surrogate, and 206 * - source or target too short 207 * - or the surrogate is unmatched 208 */ 209 length=0; 210 if(U16_IS_SURROGATE_LEAD(c)) { 211 if(source<pArgs->sourceLimit) { 212 if(U16_IS_TRAIL(trail=*source)) { 213 /* output the surrogate pair, will overflow (see conditions comment above) */ 214 ++source; 215 overflow[0]=(char)(c>>8); 216 overflow[1]=(char)c; 217 overflow[2]=(char)(trail>>8); 218 overflow[3]=(char)trail; 219 length=4; /* 4 bytes to output */ 220 c=0; 221 } else { 222 /* unmatched lead surrogate */ 223 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 224 } 225 } else { 226 /* see if the trail surrogate is in the next buffer */ 227 } 228 } else { 229 /* unmatched trail surrogate */ 230 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 231 } 232 cnv->fromUChar32=c; 233 } 234 235 if(length>0) { 236 /* output length bytes with overflow (length>targetCapacity>0) */ 237 ucnv_fromUWriteBytes(cnv, 238 overflow, length, 239 (char **)&target, pArgs->targetLimit, 240 &offsets, sourceIndex, 241 pErrorCode); 242 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 243 } 244 245 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 246 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 247 } 248 249 /* write back the updated pointers */ 250 pArgs->source=source; 251 pArgs->target=(char *)target; 252 pArgs->offsets=offsets; 253 } 254 255 static void U_CALLCONV 256 _UTF16BEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 257 UErrorCode *pErrorCode) { 258 UConverter *cnv; 259 const uint8_t *source; 260 UChar *target; 261 int32_t *offsets; 262 263 uint32_t targetCapacity, length, count, sourceIndex; 264 UChar c, trail; 265 266 if(pArgs->converter->mode<8) { 267 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 268 return; 269 } 270 271 cnv=pArgs->converter; 272 source=(const uint8_t *)pArgs->source; 273 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 274 if(length<=0 && cnv->toUnicodeStatus==0) { 275 /* no input, nothing to do */ 276 return; 277 } 278 279 target=pArgs->target; 280 if(target >= pArgs->targetLimit) { 281 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 282 return; 283 } 284 285 targetCapacity=(uint32_t)(pArgs->targetLimit-target); 286 offsets=pArgs->offsets; 287 sourceIndex=0; 288 c=0; 289 290 /* complete a partial UChar or pair from the last call */ 291 if(cnv->toUnicodeStatus!=0) { 292 /* 293 * special case: single byte from a previous buffer, 294 * where the byte turned out not to belong to a trail surrogate 295 * and the preceding, unmatched lead surrogate was put into toUBytes[] 296 * for error handling 297 */ 298 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 299 cnv->toULength=1; 300 cnv->toUnicodeStatus=0; 301 } 302 if((count=cnv->toULength)!=0) { 303 uint8_t *p=cnv->toUBytes; 304 do { 305 p[count++]=*source++; 306 ++sourceIndex; 307 --length; 308 if(count==2) { 309 c=((UChar)p[0]<<8)|p[1]; 310 if(U16_IS_SINGLE(c)) { 311 /* output the BMP code point */ 312 *target++=c; 313 if(offsets!=NULL) { 314 *offsets++=-1; 315 } 316 --targetCapacity; 317 count=0; 318 c=0; 319 break; 320 } else if(U16_IS_SURROGATE_LEAD(c)) { 321 /* continue collecting bytes for the trail surrogate */ 322 c=0; /* avoid unnecessary surrogate handling below */ 323 } else { 324 /* fall through to error handling for an unmatched trail surrogate */ 325 break; 326 } 327 } else if(count==4) { 328 c=((UChar)p[0]<<8)|p[1]; 329 trail=((UChar)p[2]<<8)|p[3]; 330 if(U16_IS_TRAIL(trail)) { 331 /* output the surrogate pair */ 332 *target++=c; 333 if(targetCapacity>=2) { 334 *target++=trail; 335 if(offsets!=NULL) { 336 *offsets++=-1; 337 *offsets++=-1; 338 } 339 targetCapacity-=2; 340 } else /* targetCapacity==1 */ { 341 targetCapacity=0; 342 cnv->UCharErrorBuffer[0]=trail; 343 cnv->UCharErrorBufferLength=1; 344 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 345 } 346 count=0; 347 c=0; 348 break; 349 } else { 350 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 351 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 352 353 /* back out reading the code unit after it */ 354 if(((const uint8_t *)pArgs->source-source)>=2) { 355 source-=2; 356 } else { 357 /* 358 * if the trail unit's first byte was in a previous buffer, then 359 * we need to put it into a special place because toUBytes[] will be 360 * used for the lead unit's bytes 361 */ 362 cnv->toUnicodeStatus=0x100|p[2]; 363 --source; 364 } 365 cnv->toULength=2; 366 367 /* write back the updated pointers */ 368 pArgs->source=(const char *)source; 369 pArgs->target=target; 370 pArgs->offsets=offsets; 371 return; 372 } 373 } 374 } while(length>0); 375 cnv->toULength=(int8_t)count; 376 } 377 378 /* copy an even number of bytes for complete UChars */ 379 count=2*targetCapacity; 380 if(count>length) { 381 count=length&~1; 382 } 383 if(c==0 && count>0) { 384 length-=count; 385 count>>=1; 386 targetCapacity-=count; 387 if(offsets==NULL) { 388 do { 389 c=((UChar)source[0]<<8)|source[1]; 390 source+=2; 391 if(U16_IS_SINGLE(c)) { 392 *target++=c; 393 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 394 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 395 ) { 396 source+=2; 397 --count; 398 *target++=c; 399 *target++=trail; 400 } else { 401 break; 402 } 403 } while(--count>0); 404 } else { 405 do { 406 c=((UChar)source[0]<<8)|source[1]; 407 source+=2; 408 if(U16_IS_SINGLE(c)) { 409 *target++=c; 410 *offsets++=sourceIndex; 411 sourceIndex+=2; 412 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 413 U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1]) 414 ) { 415 source+=2; 416 --count; 417 *target++=c; 418 *target++=trail; 419 *offsets++=sourceIndex; 420 *offsets++=sourceIndex; 421 sourceIndex+=4; 422 } else { 423 break; 424 } 425 } while(--count>0); 426 } 427 428 if(count==0) { 429 /* done with the loop for complete UChars */ 430 c=0; 431 } else { 432 /* keep c for surrogate handling, trail will be set there */ 433 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 434 targetCapacity+=count; 435 } 436 } 437 438 if(c!=0) { 439 /* 440 * c is a surrogate, and 441 * - source or target too short 442 * - or the surrogate is unmatched 443 */ 444 cnv->toUBytes[0]=(uint8_t)(c>>8); 445 cnv->toUBytes[1]=(uint8_t)c; 446 cnv->toULength=2; 447 448 if(U16_IS_SURROGATE_LEAD(c)) { 449 if(length>=2) { 450 if(U16_IS_TRAIL(trail=((UChar)source[0]<<8)|source[1])) { 451 /* output the surrogate pair, will overflow (see conditions comment above) */ 452 source+=2; 453 length-=2; 454 *target++=c; 455 if(offsets!=NULL) { 456 *offsets++=sourceIndex; 457 } 458 cnv->UCharErrorBuffer[0]=trail; 459 cnv->UCharErrorBufferLength=1; 460 cnv->toULength=0; 461 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 462 } else { 463 /* unmatched lead surrogate */ 464 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 465 } 466 } else { 467 /* see if the trail surrogate is in the next buffer */ 468 } 469 } else { 470 /* unmatched trail surrogate */ 471 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 472 } 473 } 474 475 if(U_SUCCESS(*pErrorCode)) { 476 /* check for a remaining source byte */ 477 if(length>0) { 478 if(targetCapacity==0) { 479 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 480 } else { 481 /* it must be length==1 because otherwise the above would have copied more */ 482 cnv->toUBytes[cnv->toULength++]=*source++; 483 } 484 } 485 } 486 487 /* write back the updated pointers */ 488 pArgs->source=(const char *)source; 489 pArgs->target=target; 490 pArgs->offsets=offsets; 491 } 492 493 static UChar32 U_CALLCONV 494 _UTF16BEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 495 const uint8_t *s, *sourceLimit; 496 UChar32 c; 497 498 if(pArgs->converter->mode<8) { 499 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 500 } 501 502 s=(const uint8_t *)pArgs->source; 503 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 504 505 if(s>=sourceLimit) { 506 /* no input */ 507 *err=U_INDEX_OUTOFBOUNDS_ERROR; 508 return 0xffff; 509 } 510 511 if(s+2>sourceLimit) { 512 /* only one byte: truncated UChar */ 513 pArgs->converter->toUBytes[0]=*s++; 514 pArgs->converter->toULength=1; 515 pArgs->source=(const char *)s; 516 *err = U_TRUNCATED_CHAR_FOUND; 517 return 0xffff; 518 } 519 520 /* get one UChar */ 521 c=((UChar32)*s<<8)|s[1]; 522 s+=2; 523 524 /* check for a surrogate pair */ 525 if(U_IS_SURROGATE(c)) { 526 if(U16_IS_SURROGATE_LEAD(c)) { 527 if(s+2<=sourceLimit) { 528 UChar trail; 529 530 /* get a second UChar and see if it is a trail surrogate */ 531 trail=((UChar)*s<<8)|s[1]; 532 if(U16_IS_TRAIL(trail)) { 533 c=U16_GET_SUPPLEMENTARY(c, trail); 534 s+=2; 535 } else { 536 /* unmatched lead surrogate */ 537 c=-2; 538 } 539 } else { 540 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 541 uint8_t *bytes=pArgs->converter->toUBytes; 542 s-=2; 543 pArgs->converter->toULength=(int8_t)(sourceLimit-s); 544 do { 545 *bytes++=*s++; 546 } while(s<sourceLimit); 547 548 c=0xffff; 549 *err=U_TRUNCATED_CHAR_FOUND; 550 } 551 } else { 552 /* unmatched trail surrogate */ 553 c=-2; 554 } 555 556 if(c<0) { 557 /* write the unmatched surrogate */ 558 uint8_t *bytes=pArgs->converter->toUBytes; 559 pArgs->converter->toULength=2; 560 *bytes=*(s-2); 561 bytes[1]=*(s-1); 562 563 c=0xffff; 564 *err=U_ILLEGAL_CHAR_FOUND; 565 } 566 } 567 568 pArgs->source=(const char *)s; 569 return c; 570 } 571 572 static void U_CALLCONV 573 _UTF16BEReset(UConverter *cnv, UConverterResetChoice choice) { 574 if(choice<=UCNV_RESET_TO_UNICODE) { 575 /* reset toUnicode state */ 576 if(UCNV_GET_VERSION(cnv)==0) { 577 cnv->mode=8; /* no BOM handling */ 578 } else { 579 cnv->mode=0; /* Java-specific "UnicodeBig" requires BE BOM or no BOM */ 580 } 581 } 582 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 583 /* reset fromUnicode for "UnicodeBig": prepare to output the UTF-16BE BOM */ 584 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 585 } 586 } 587 588 static void U_CALLCONV 589 _UTF16BEOpen(UConverter *cnv, 590 UConverterLoadArgs *pArgs, 591 UErrorCode *pErrorCode) { 592 (void)pArgs; 593 if(UCNV_GET_VERSION(cnv)<=1) { 594 _UTF16BEReset(cnv, UCNV_RESET_BOTH); 595 } else { 596 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 597 } 598 } 599 600 static const char * U_CALLCONV 601 _UTF16BEGetName(const UConverter *cnv) { 602 if(UCNV_GET_VERSION(cnv)==0) { 603 return "UTF-16BE"; 604 } else { 605 return "UTF-16BE,version=1"; 606 } 607 } 608 U_CDECL_END 609 610 static const UConverterImpl _UTF16BEImpl={ 611 UCNV_UTF16_BigEndian, 612 613 NULL, 614 NULL, 615 616 _UTF16BEOpen, 617 NULL, 618 _UTF16BEReset, 619 620 _UTF16BEToUnicodeWithOffsets, 621 _UTF16BEToUnicodeWithOffsets, 622 _UTF16BEFromUnicodeWithOffsets, 623 _UTF16BEFromUnicodeWithOffsets, 624 _UTF16BEGetNextUChar, 625 626 NULL, 627 _UTF16BEGetName, 628 NULL, 629 NULL, 630 ucnv_getNonSurrogateUnicodeSet, 631 632 NULL, 633 NULL 634 }; 635 636 static const UConverterStaticData _UTF16BEStaticData={ 637 sizeof(UConverterStaticData), 638 "UTF-16BE", 639 1200, UCNV_IBM, UCNV_UTF16_BigEndian, 2, 2, 640 { 0xff, 0xfd, 0, 0 },2,FALSE,FALSE, 641 0, 642 0, 643 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 644 }; 645 646 647 const UConverterSharedData _UTF16BEData= 648 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16BEStaticData, &_UTF16BEImpl); 649 650 /* UTF-16LE ----------------------------------------------------------------- */ 651 U_CDECL_BEGIN 652 static void U_CALLCONV 653 _UTF16LEFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, 654 UErrorCode *pErrorCode) { 655 UConverter *cnv; 656 const UChar *source; 657 char *target; 658 int32_t *offsets; 659 660 uint32_t targetCapacity, length, sourceIndex; 661 UChar c, trail; 662 char overflow[4]; 663 664 source=pArgs->source; 665 length=(int32_t)(pArgs->sourceLimit-source); 666 if(length<=0) { 667 /* no input, nothing to do */ 668 return; 669 } 670 671 cnv=pArgs->converter; 672 673 /* write the BOM if necessary */ 674 if(cnv->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 675 static const char bom[]={ (char)0xff, (char)0xfe }; 676 ucnv_fromUWriteBytes(cnv, 677 bom, 2, 678 &pArgs->target, pArgs->targetLimit, 679 &pArgs->offsets, -1, 680 pErrorCode); 681 cnv->fromUnicodeStatus=0; 682 } 683 684 target=pArgs->target; 685 if(target >= pArgs->targetLimit) { 686 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 687 return; 688 } 689 690 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 691 offsets=pArgs->offsets; 692 sourceIndex=0; 693 694 /* c!=0 indicates in several places outside the main loops that a surrogate was found */ 695 696 if((c=(UChar)cnv->fromUChar32)!=0 && U16_IS_TRAIL(trail=*source) && targetCapacity>=4) { 697 /* the last buffer ended with a lead surrogate, output the surrogate pair */ 698 ++source; 699 --length; 700 target[0]=(uint8_t)c; 701 target[1]=(uint8_t)(c>>8); 702 target[2]=(uint8_t)trail; 703 target[3]=(uint8_t)(trail>>8); 704 target+=4; 705 targetCapacity-=4; 706 if(offsets!=NULL) { 707 *offsets++=-1; 708 *offsets++=-1; 709 *offsets++=-1; 710 *offsets++=-1; 711 } 712 sourceIndex=1; 713 cnv->fromUChar32=c=0; 714 } 715 716 if(c==0) { 717 /* copy an even number of bytes for complete UChars */ 718 uint32_t count=2*length; 719 if(count>targetCapacity) { 720 count=targetCapacity&~1; 721 } 722 /* count is even */ 723 targetCapacity-=count; 724 count>>=1; 725 length-=count; 726 727 if(offsets==NULL) { 728 while(count>0) { 729 c=*source++; 730 if(U16_IS_SINGLE(c)) { 731 target[0]=(uint8_t)c; 732 target[1]=(uint8_t)(c>>8); 733 target+=2; 734 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 735 ++source; 736 --count; 737 target[0]=(uint8_t)c; 738 target[1]=(uint8_t)(c>>8); 739 target[2]=(uint8_t)trail; 740 target[3]=(uint8_t)(trail>>8); 741 target+=4; 742 } else { 743 break; 744 } 745 --count; 746 } 747 } else { 748 while(count>0) { 749 c=*source++; 750 if(U16_IS_SINGLE(c)) { 751 target[0]=(uint8_t)c; 752 target[1]=(uint8_t)(c>>8); 753 target+=2; 754 *offsets++=sourceIndex; 755 *offsets++=sourceIndex++; 756 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && U16_IS_TRAIL(trail=*source)) { 757 ++source; 758 --count; 759 target[0]=(uint8_t)c; 760 target[1]=(uint8_t)(c>>8); 761 target[2]=(uint8_t)trail; 762 target[3]=(uint8_t)(trail>>8); 763 target+=4; 764 *offsets++=sourceIndex; 765 *offsets++=sourceIndex; 766 *offsets++=sourceIndex; 767 *offsets++=sourceIndex; 768 sourceIndex+=2; 769 } else { 770 break; 771 } 772 --count; 773 } 774 } 775 776 if(count==0) { 777 /* done with the loop for complete UChars */ 778 if(length>0 && targetCapacity>0) { 779 /* 780 * there is more input and some target capacity - 781 * it must be targetCapacity==1 because otherwise 782 * the above would have copied more; 783 * prepare for overflow output 784 */ 785 if(U16_IS_SINGLE(c=*source++)) { 786 overflow[0]=(char)c; 787 overflow[1]=(char)(c>>8); 788 length=2; /* 2 bytes to output */ 789 c=0; 790 /* } else { keep c for surrogate handling, length will be set there */ 791 } 792 } else { 793 length=0; 794 c=0; 795 } 796 } else { 797 /* keep c for surrogate handling, length will be set there */ 798 targetCapacity+=2*count; 799 } 800 } else { 801 length=0; /* from here on, length counts the bytes in overflow[] */ 802 } 803 804 if(c!=0) { 805 /* 806 * c is a surrogate, and 807 * - source or target too short 808 * - or the surrogate is unmatched 809 */ 810 length=0; 811 if(U16_IS_SURROGATE_LEAD(c)) { 812 if(source<pArgs->sourceLimit) { 813 if(U16_IS_TRAIL(trail=*source)) { 814 /* output the surrogate pair, will overflow (see conditions comment above) */ 815 ++source; 816 overflow[0]=(char)c; 817 overflow[1]=(char)(c>>8); 818 overflow[2]=(char)trail; 819 overflow[3]=(char)(trail>>8); 820 length=4; /* 4 bytes to output */ 821 c=0; 822 } else { 823 /* unmatched lead surrogate */ 824 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 825 } 826 } else { 827 /* see if the trail surrogate is in the next buffer */ 828 } 829 } else { 830 /* unmatched trail surrogate */ 831 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 832 } 833 cnv->fromUChar32=c; 834 } 835 836 if(length>0) { 837 /* output length bytes with overflow (length>targetCapacity>0) */ 838 ucnv_fromUWriteBytes(cnv, 839 overflow, length, 840 &target, pArgs->targetLimit, 841 &offsets, sourceIndex, 842 pErrorCode); 843 targetCapacity=(uint32_t)(pArgs->targetLimit-(char *)target); 844 } 845 846 if(U_SUCCESS(*pErrorCode) && source<pArgs->sourceLimit && targetCapacity==0) { 847 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 848 } 849 850 /* write back the updated pointers */ 851 pArgs->source=source; 852 pArgs->target=target; 853 pArgs->offsets=offsets; 854 } 855 856 static void U_CALLCONV 857 _UTF16LEToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 858 UErrorCode *pErrorCode) { 859 UConverter *cnv; 860 const uint8_t *source; 861 UChar *target; 862 int32_t *offsets; 863 864 uint32_t targetCapacity, length, count, sourceIndex; 865 UChar c, trail; 866 867 if(pArgs->converter->mode<8) { 868 _UTF16ToUnicodeWithOffsets(pArgs, pErrorCode); 869 return; 870 } 871 872 cnv=pArgs->converter; 873 source=(const uint8_t *)pArgs->source; 874 length=(int32_t)((const uint8_t *)pArgs->sourceLimit-source); 875 if(length<=0 && cnv->toUnicodeStatus==0) { 876 /* no input, nothing to do */ 877 return; 878 } 879 880 target=pArgs->target; 881 if(target >= pArgs->targetLimit) { 882 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 883 return; 884 } 885 886 targetCapacity=(uint32_t)(pArgs->targetLimit-pArgs->target); 887 offsets=pArgs->offsets; 888 sourceIndex=0; 889 c=0; 890 891 /* complete a partial UChar or pair from the last call */ 892 if(cnv->toUnicodeStatus!=0) { 893 /* 894 * special case: single byte from a previous buffer, 895 * where the byte turned out not to belong to a trail surrogate 896 * and the preceding, unmatched lead surrogate was put into toUBytes[] 897 * for error handling 898 */ 899 cnv->toUBytes[0]=(uint8_t)cnv->toUnicodeStatus; 900 cnv->toULength=1; 901 cnv->toUnicodeStatus=0; 902 } 903 if((count=cnv->toULength)!=0) { 904 uint8_t *p=cnv->toUBytes; 905 do { 906 p[count++]=*source++; 907 ++sourceIndex; 908 --length; 909 if(count==2) { 910 c=((UChar)p[1]<<8)|p[0]; 911 if(U16_IS_SINGLE(c)) { 912 /* output the BMP code point */ 913 *target++=c; 914 if(offsets!=NULL) { 915 *offsets++=-1; 916 } 917 --targetCapacity; 918 count=0; 919 c=0; 920 break; 921 } else if(U16_IS_SURROGATE_LEAD(c)) { 922 /* continue collecting bytes for the trail surrogate */ 923 c=0; /* avoid unnecessary surrogate handling below */ 924 } else { 925 /* fall through to error handling for an unmatched trail surrogate */ 926 break; 927 } 928 } else if(count==4) { 929 c=((UChar)p[1]<<8)|p[0]; 930 trail=((UChar)p[3]<<8)|p[2]; 931 if(U16_IS_TRAIL(trail)) { 932 /* output the surrogate pair */ 933 *target++=c; 934 if(targetCapacity>=2) { 935 *target++=trail; 936 if(offsets!=NULL) { 937 *offsets++=-1; 938 *offsets++=-1; 939 } 940 targetCapacity-=2; 941 } else /* targetCapacity==1 */ { 942 targetCapacity=0; 943 cnv->UCharErrorBuffer[0]=trail; 944 cnv->UCharErrorBufferLength=1; 945 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 946 } 947 count=0; 948 c=0; 949 break; 950 } else { 951 /* unmatched lead surrogate, handle here for consistent toUBytes[] */ 952 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 953 954 /* back out reading the code unit after it */ 955 if(((const uint8_t *)pArgs->source-source)>=2) { 956 source-=2; 957 } else { 958 /* 959 * if the trail unit's first byte was in a previous buffer, then 960 * we need to put it into a special place because toUBytes[] will be 961 * used for the lead unit's bytes 962 */ 963 cnv->toUnicodeStatus=0x100|p[2]; 964 --source; 965 } 966 cnv->toULength=2; 967 968 /* write back the updated pointers */ 969 pArgs->source=(const char *)source; 970 pArgs->target=target; 971 pArgs->offsets=offsets; 972 return; 973 } 974 } 975 } while(length>0); 976 cnv->toULength=(int8_t)count; 977 } 978 979 /* copy an even number of bytes for complete UChars */ 980 count=2*targetCapacity; 981 if(count>length) { 982 count=length&~1; 983 } 984 if(c==0 && count>0) { 985 length-=count; 986 count>>=1; 987 targetCapacity-=count; 988 if(offsets==NULL) { 989 do { 990 c=((UChar)source[1]<<8)|source[0]; 991 source+=2; 992 if(U16_IS_SINGLE(c)) { 993 *target++=c; 994 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 995 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 996 ) { 997 source+=2; 998 --count; 999 *target++=c; 1000 *target++=trail; 1001 } else { 1002 break; 1003 } 1004 } while(--count>0); 1005 } else { 1006 do { 1007 c=((UChar)source[1]<<8)|source[0]; 1008 source+=2; 1009 if(U16_IS_SINGLE(c)) { 1010 *target++=c; 1011 *offsets++=sourceIndex; 1012 sourceIndex+=2; 1013 } else if(U16_IS_SURROGATE_LEAD(c) && count>=2 && 1014 U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0]) 1015 ) { 1016 source+=2; 1017 --count; 1018 *target++=c; 1019 *target++=trail; 1020 *offsets++=sourceIndex; 1021 *offsets++=sourceIndex; 1022 sourceIndex+=4; 1023 } else { 1024 break; 1025 } 1026 } while(--count>0); 1027 } 1028 1029 if(count==0) { 1030 /* done with the loop for complete UChars */ 1031 c=0; 1032 } else { 1033 /* keep c for surrogate handling, trail will be set there */ 1034 length+=2*(count-1); /* one more byte pair was consumed than count decremented */ 1035 targetCapacity+=count; 1036 } 1037 } 1038 1039 if(c!=0) { 1040 /* 1041 * c is a surrogate, and 1042 * - source or target too short 1043 * - or the surrogate is unmatched 1044 */ 1045 cnv->toUBytes[0]=(uint8_t)c; 1046 cnv->toUBytes[1]=(uint8_t)(c>>8); 1047 cnv->toULength=2; 1048 1049 if(U16_IS_SURROGATE_LEAD(c)) { 1050 if(length>=2) { 1051 if(U16_IS_TRAIL(trail=((UChar)source[1]<<8)|source[0])) { 1052 /* output the surrogate pair, will overflow (see conditions comment above) */ 1053 source+=2; 1054 length-=2; 1055 *target++=c; 1056 if(offsets!=NULL) { 1057 *offsets++=sourceIndex; 1058 } 1059 cnv->UCharErrorBuffer[0]=trail; 1060 cnv->UCharErrorBufferLength=1; 1061 cnv->toULength=0; 1062 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1063 } else { 1064 /* unmatched lead surrogate */ 1065 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1066 } 1067 } else { 1068 /* see if the trail surrogate is in the next buffer */ 1069 } 1070 } else { 1071 /* unmatched trail surrogate */ 1072 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 1073 } 1074 } 1075 1076 if(U_SUCCESS(*pErrorCode)) { 1077 /* check for a remaining source byte */ 1078 if(length>0) { 1079 if(targetCapacity==0) { 1080 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 1081 } else { 1082 /* it must be length==1 because otherwise the above would have copied more */ 1083 cnv->toUBytes[cnv->toULength++]=*source++; 1084 } 1085 } 1086 } 1087 1088 /* write back the updated pointers */ 1089 pArgs->source=(const char *)source; 1090 pArgs->target=target; 1091 pArgs->offsets=offsets; 1092 } 1093 1094 static UChar32 U_CALLCONV 1095 _UTF16LEGetNextUChar(UConverterToUnicodeArgs *pArgs, UErrorCode *err) { 1096 const uint8_t *s, *sourceLimit; 1097 UChar32 c; 1098 1099 if(pArgs->converter->mode<8) { 1100 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1101 } 1102 1103 s=(const uint8_t *)pArgs->source; 1104 sourceLimit=(const uint8_t *)pArgs->sourceLimit; 1105 1106 if(s>=sourceLimit) { 1107 /* no input */ 1108 *err=U_INDEX_OUTOFBOUNDS_ERROR; 1109 return 0xffff; 1110 } 1111 1112 if(s+2>sourceLimit) { 1113 /* only one byte: truncated UChar */ 1114 pArgs->converter->toUBytes[0]=*s++; 1115 pArgs->converter->toULength=1; 1116 pArgs->source=(const char *)s; 1117 *err = U_TRUNCATED_CHAR_FOUND; 1118 return 0xffff; 1119 } 1120 1121 /* get one UChar */ 1122 c=((UChar32)s[1]<<8)|*s; 1123 s+=2; 1124 1125 /* check for a surrogate pair */ 1126 if(U_IS_SURROGATE(c)) { 1127 if(U16_IS_SURROGATE_LEAD(c)) { 1128 if(s+2<=sourceLimit) { 1129 UChar trail; 1130 1131 /* get a second UChar and see if it is a trail surrogate */ 1132 trail=((UChar)s[1]<<8)|*s; 1133 if(U16_IS_TRAIL(trail)) { 1134 c=U16_GET_SUPPLEMENTARY(c, trail); 1135 s+=2; 1136 } else { 1137 /* unmatched lead surrogate */ 1138 c=-2; 1139 } 1140 } else { 1141 /* too few (2 or 3) bytes for a surrogate pair: truncated code point */ 1142 uint8_t *bytes=pArgs->converter->toUBytes; 1143 s-=2; 1144 pArgs->converter->toULength=(int8_t)(sourceLimit-s); 1145 do { 1146 *bytes++=*s++; 1147 } while(s<sourceLimit); 1148 1149 c=0xffff; 1150 *err=U_TRUNCATED_CHAR_FOUND; 1151 } 1152 } else { 1153 /* unmatched trail surrogate */ 1154 c=-2; 1155 } 1156 1157 if(c<0) { 1158 /* write the unmatched surrogate */ 1159 uint8_t *bytes=pArgs->converter->toUBytes; 1160 pArgs->converter->toULength=2; 1161 *bytes=*(s-2); 1162 bytes[1]=*(s-1); 1163 1164 c=0xffff; 1165 *err=U_ILLEGAL_CHAR_FOUND; 1166 } 1167 } 1168 1169 pArgs->source=(const char *)s; 1170 return c; 1171 } 1172 1173 static void U_CALLCONV 1174 _UTF16LEReset(UConverter *cnv, UConverterResetChoice choice) { 1175 if(choice<=UCNV_RESET_TO_UNICODE) { 1176 /* reset toUnicode state */ 1177 if(UCNV_GET_VERSION(cnv)==0) { 1178 cnv->mode=8; /* no BOM handling */ 1179 } else { 1180 cnv->mode=0; /* Java-specific "UnicodeLittle" requires LE BOM or no BOM */ 1181 } 1182 } 1183 if(choice!=UCNV_RESET_TO_UNICODE && UCNV_GET_VERSION(cnv)==1) { 1184 /* reset fromUnicode for "UnicodeLittle": prepare to output the UTF-16LE BOM */ 1185 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1186 } 1187 } 1188 1189 static void U_CALLCONV 1190 _UTF16LEOpen(UConverter *cnv, 1191 UConverterLoadArgs *pArgs, 1192 UErrorCode *pErrorCode) { 1193 (void)pArgs; 1194 if(UCNV_GET_VERSION(cnv)<=1) { 1195 _UTF16LEReset(cnv, UCNV_RESET_BOTH); 1196 } else { 1197 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1198 } 1199 } 1200 1201 static const char * U_CALLCONV 1202 _UTF16LEGetName(const UConverter *cnv) { 1203 if(UCNV_GET_VERSION(cnv)==0) { 1204 return "UTF-16LE"; 1205 } else { 1206 return "UTF-16LE,version=1"; 1207 } 1208 } 1209 U_CDECL_END 1210 1211 static const UConverterImpl _UTF16LEImpl={ 1212 UCNV_UTF16_LittleEndian, 1213 1214 NULL, 1215 NULL, 1216 1217 _UTF16LEOpen, 1218 NULL, 1219 _UTF16LEReset, 1220 1221 _UTF16LEToUnicodeWithOffsets, 1222 _UTF16LEToUnicodeWithOffsets, 1223 _UTF16LEFromUnicodeWithOffsets, 1224 _UTF16LEFromUnicodeWithOffsets, 1225 _UTF16LEGetNextUChar, 1226 1227 NULL, 1228 _UTF16LEGetName, 1229 NULL, 1230 NULL, 1231 ucnv_getNonSurrogateUnicodeSet, 1232 1233 NULL, 1234 NULL 1235 }; 1236 1237 1238 static const UConverterStaticData _UTF16LEStaticData={ 1239 sizeof(UConverterStaticData), 1240 "UTF-16LE", 1241 1202, UCNV_IBM, UCNV_UTF16_LittleEndian, 2, 2, 1242 { 0xfd, 0xff, 0, 0 },2,FALSE,FALSE, 1243 0, 1244 0, 1245 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1246 }; 1247 1248 1249 const UConverterSharedData _UTF16LEData= 1250 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16LEStaticData, &_UTF16LEImpl); 1251 1252 /* UTF-16 (Detect BOM) ------------------------------------------------------ */ 1253 1254 /* 1255 * Detect a BOM at the beginning of the stream and select UTF-16BE or UTF-16LE 1256 * accordingly. 1257 * This is a simpler version of the UTF-32 converter, with 1258 * fewer states for shorter BOMs. 1259 * 1260 * State values: 1261 * 0 initial state 1262 * 1 saw first byte 1263 * 2..5 - 1264 * 6..7 see _UTF16ToUnicodeWithOffsets() comments in state 1 1265 * 8 UTF-16BE mode 1266 * 9 UTF-16LE mode 1267 * 1268 * During detection: state==number of initial bytes seen so far. 1269 * 1270 * On output, emit U+FEFF as the first code point. 1271 * 1272 * Variants: 1273 * - UTF-16,version=1 (Java "Unicode" encoding) treats a missing BOM as an error. 1274 * - UTF-16BE,version=1 (Java "UnicodeBig" encoding) and 1275 * UTF-16LE,version=1 (Java "UnicodeLittle" encoding) treat a reverse BOM as an error. 1276 */ 1277 U_CDECL_BEGIN 1278 static void U_CALLCONV 1279 _UTF16Reset(UConverter *cnv, UConverterResetChoice choice) { 1280 if(choice<=UCNV_RESET_TO_UNICODE) { 1281 /* reset toUnicode: state=0 */ 1282 cnv->mode=0; 1283 } 1284 if(choice!=UCNV_RESET_TO_UNICODE) { 1285 /* reset fromUnicode: prepare to output the UTF-16PE BOM */ 1286 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1287 } 1288 } 1289 U_CDECL_END 1290 extern const UConverterSharedData _UTF16v2Data; 1291 U_CDECL_BEGIN 1292 static void U_CALLCONV 1293 _UTF16Open(UConverter *cnv, 1294 UConverterLoadArgs *pArgs, 1295 UErrorCode *pErrorCode) { 1296 if(UCNV_GET_VERSION(cnv)<=2) { 1297 if(UCNV_GET_VERSION(cnv)==2 && !pArgs->onlyTestIsLoadable) { 1298 /* 1299 * Switch implementation, and switch the staticData that's different 1300 * and was copied into the UConverter. 1301 * (See ucnv_createConverterFromSharedData() in ucnv_bld.c.) 1302 * UTF-16,version=2 fromUnicode() always writes a big-endian byte stream. 1303 */ 1304 cnv->sharedData=(UConverterSharedData*)&_UTF16v2Data; 1305 uprv_memcpy(cnv->subChars, _UTF16v2Data.staticData->subChar, UCNV_MAX_SUBCHAR_LEN); 1306 } 1307 _UTF16Reset(cnv, UCNV_RESET_BOTH); 1308 } else { 1309 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 1310 } 1311 } 1312 1313 static const char * U_CALLCONV 1314 _UTF16GetName(const UConverter *cnv) { 1315 if(UCNV_GET_VERSION(cnv)==0) { 1316 return "UTF-16"; 1317 } else if(UCNV_GET_VERSION(cnv)==1) { 1318 return "UTF-16,version=1"; 1319 } else { 1320 return "UTF-16,version=2"; 1321 } 1322 } 1323 U_CDECL_END 1324 extern const UConverterSharedData _UTF16Data; 1325 1326 static inline bool IS_UTF16BE(const UConverter *cnv) { 1327 return ((cnv)->sharedData == &_UTF16BEData); 1328 } 1329 1330 static inline bool IS_UTF16LE(const UConverter *cnv) { 1331 return ((cnv)->sharedData == &_UTF16LEData); 1332 } 1333 1334 static inline bool IS_UTF16(const UConverter *cnv) { 1335 return ((cnv)->sharedData==&_UTF16Data) || ((cnv)->sharedData == &_UTF16v2Data); 1336 } 1337 1338 U_CDECL_BEGIN 1339 static void U_CALLCONV 1340 _UTF16ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1341 UErrorCode *pErrorCode) { 1342 UConverter *cnv=pArgs->converter; 1343 const char *source=pArgs->source; 1344 const char *sourceLimit=pArgs->sourceLimit; 1345 int32_t *offsets=pArgs->offsets; 1346 1347 int32_t state, offsetDelta; 1348 uint8_t b; 1349 1350 state=cnv->mode; 1351 1352 /* 1353 * If we detect a BOM in this buffer, then we must add the BOM size to the 1354 * offsets because the actual converter function will not see and count the BOM. 1355 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1356 */ 1357 offsetDelta=0; 1358 1359 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1360 switch(state) { 1361 case 0: 1362 cnv->toUBytes[0]=(uint8_t)*source++; 1363 cnv->toULength=1; 1364 state=1; 1365 break; 1366 case 1: 1367 /* 1368 * Only inside this switch case can the state variable 1369 * temporarily take two additional values: 1370 * 6: BOM error, continue with BE 1371 * 7: BOM error, continue with LE 1372 */ 1373 b=*source; 1374 if(cnv->toUBytes[0]==0xfe && b==0xff) { 1375 if(IS_UTF16LE(cnv)) { 1376 state=7; /* illegal reverse BOM for Java "UnicodeLittle" */ 1377 } else { 1378 state=8; /* detect UTF-16BE */ 1379 } 1380 } else if(cnv->toUBytes[0]==0xff && b==0xfe) { 1381 if(IS_UTF16BE(cnv)) { 1382 state=6; /* illegal reverse BOM for Java "UnicodeBig" */ 1383 } else { 1384 state=9; /* detect UTF-16LE */ 1385 } 1386 } else if((IS_UTF16(cnv) && UCNV_GET_VERSION(cnv)==1)) { 1387 state=6; /* illegal missing BOM for Java "Unicode" */ 1388 } 1389 if(state>=8) { 1390 /* BOM detected, consume it */ 1391 ++source; 1392 cnv->toULength=0; 1393 offsetDelta=(int32_t)(source-pArgs->source); 1394 } else if(state<6) { 1395 /* ok: no BOM, and not a reverse BOM */ 1396 if(source!=pArgs->source) { 1397 /* reset the source for a correct first offset */ 1398 source=pArgs->source; 1399 cnv->toULength=0; 1400 } 1401 if(IS_UTF16LE(cnv)) { 1402 /* Make Java "UnicodeLittle" default to LE. */ 1403 state=9; 1404 } else { 1405 /* Make standard UTF-16 and Java "UnicodeBig" default to BE. */ 1406 state=8; 1407 } 1408 } else { 1409 /* 1410 * error: missing BOM, or reverse BOM 1411 * UTF-16,version=1: Java-specific "Unicode" requires a BOM. 1412 * UTF-16BE,version=1: Java-specific "UnicodeBig" requires a BE BOM or no BOM. 1413 * UTF-16LE,version=1: Java-specific "UnicodeLittle" requires an LE BOM or no BOM. 1414 */ 1415 /* report the non-BOM or reverse BOM as an illegal sequence */ 1416 cnv->toUBytes[1]=b; 1417 cnv->toULength=2; 1418 pArgs->source=source+1; 1419 /* continue with conversion if the callback resets the error */ 1420 /* 1421 * Make Java "Unicode" default to BE like standard UTF-16. 1422 * Make Java "UnicodeBig" and "UnicodeLittle" default 1423 * to their normal endiannesses. 1424 */ 1425 cnv->mode=state+2; 1426 *pErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; 1427 return; 1428 } 1429 /* convert the rest of the stream */ 1430 cnv->mode=state; 1431 continue; 1432 case 8: 1433 /* call UTF-16BE */ 1434 pArgs->source=source; 1435 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1436 source=pArgs->source; 1437 break; 1438 case 9: 1439 /* call UTF-16LE */ 1440 pArgs->source=source; 1441 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1442 source=pArgs->source; 1443 break; 1444 default: 1445 break; /* does not occur */ 1446 } 1447 } 1448 1449 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1450 if(offsets!=NULL && offsetDelta!=0) { 1451 int32_t *offsetsLimit=pArgs->offsets; 1452 while(offsets<offsetsLimit) { 1453 *offsets++ += offsetDelta; 1454 } 1455 } 1456 1457 pArgs->source=source; 1458 1459 if(source==sourceLimit && pArgs->flush) { 1460 /* handle truncated input */ 1461 switch(state) { 1462 case 0: 1463 break; /* no input at all, nothing to do */ 1464 case 8: 1465 _UTF16BEToUnicodeWithOffsets(pArgs, pErrorCode); 1466 break; 1467 case 9: 1468 _UTF16LEToUnicodeWithOffsets(pArgs, pErrorCode); 1469 break; 1470 default: 1471 /* 0<state<8: framework will report truncation, nothing to do here */ 1472 break; 1473 } 1474 } 1475 1476 cnv->mode=state; 1477 } 1478 1479 static UChar32 U_CALLCONV 1480 _UTF16GetNextUChar(UConverterToUnicodeArgs *pArgs, 1481 UErrorCode *pErrorCode) { 1482 switch(pArgs->converter->mode) { 1483 case 8: 1484 return _UTF16BEGetNextUChar(pArgs, pErrorCode); 1485 case 9: 1486 return _UTF16LEGetNextUChar(pArgs, pErrorCode); 1487 default: 1488 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1489 } 1490 } 1491 U_CDECL_END 1492 1493 static const UConverterImpl _UTF16Impl = { 1494 UCNV_UTF16, 1495 1496 NULL, 1497 NULL, 1498 1499 _UTF16Open, 1500 NULL, 1501 _UTF16Reset, 1502 1503 _UTF16ToUnicodeWithOffsets, 1504 _UTF16ToUnicodeWithOffsets, 1505 _UTF16PEFromUnicodeWithOffsets, 1506 _UTF16PEFromUnicodeWithOffsets, 1507 _UTF16GetNextUChar, 1508 1509 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1510 _UTF16GetName, 1511 NULL, 1512 NULL, 1513 ucnv_getNonSurrogateUnicodeSet, 1514 1515 NULL, 1516 NULL 1517 }; 1518 1519 static const UConverterStaticData _UTF16StaticData = { 1520 sizeof(UConverterStaticData), 1521 "UTF-16", 1522 1204, /* CCSID for BOM sensitive UTF-16 */ 1523 UCNV_IBM, UCNV_UTF16, 2, 2, 1524 #if U_IS_BIG_ENDIAN 1525 { 0xff, 0xfd, 0, 0 }, 2, 1526 #else 1527 { 0xfd, 0xff, 0, 0 }, 2, 1528 #endif 1529 FALSE, FALSE, 1530 0, 1531 0, 1532 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1533 }; 1534 1535 const UConverterSharedData _UTF16Data = 1536 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16StaticData, &_UTF16Impl); 1537 1538 static const UConverterImpl _UTF16v2Impl = { 1539 UCNV_UTF16, 1540 1541 NULL, 1542 NULL, 1543 1544 _UTF16Open, 1545 NULL, 1546 _UTF16Reset, 1547 1548 _UTF16ToUnicodeWithOffsets, 1549 _UTF16ToUnicodeWithOffsets, 1550 _UTF16BEFromUnicodeWithOffsets, 1551 _UTF16BEFromUnicodeWithOffsets, 1552 _UTF16GetNextUChar, 1553 1554 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1555 _UTF16GetName, 1556 NULL, 1557 NULL, 1558 ucnv_getNonSurrogateUnicodeSet, 1559 1560 NULL, 1561 NULL 1562 }; 1563 1564 static const UConverterStaticData _UTF16v2StaticData = { 1565 sizeof(UConverterStaticData), 1566 "UTF-16,version=2", 1567 1204, /* CCSID for BOM sensitive UTF-16 */ 1568 UCNV_IBM, UCNV_UTF16, 2, 2, 1569 { 0xff, 0xfd, 0, 0 }, 2, 1570 FALSE, FALSE, 1571 0, 1572 0, 1573 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1574 }; 1575 1576 const UConverterSharedData _UTF16v2Data = 1577 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF16v2StaticData, &_UTF16v2Impl); 1578 1579 #endif 1580