1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv_u32.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2002jul01 12 * created by: Markus W. Scherer 13 * 14 * UTF-32 converter implementation. Used to be in ucnv_utf.c. 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 20 21 #include "unicode/ucnv.h" 22 #include "unicode/utf.h" 23 #include "ucnv_bld.h" 24 #include "ucnv_cnv.h" 25 #include "cmemory.h" 26 27 #define MAXIMUM_UCS2 0x0000FFFF 28 #define MAXIMUM_UTF 0x0010FFFF 29 #define HALF_SHIFT 10 30 #define HALF_BASE 0x0010000 31 #define HALF_MASK 0x3FF 32 #define SURROGATE_HIGH_START 0xD800 33 #define SURROGATE_LOW_START 0xDC00 34 35 /* -SURROGATE_LOW_START + HALF_BASE */ 36 #define SURROGATE_LOW_BASE 9216 37 38 enum { 39 UCNV_NEED_TO_WRITE_BOM=1 40 }; 41 42 /* UTF-32BE ----------------------------------------------------------------- */ 43 44 static void 45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, 46 UErrorCode * err) 47 { 48 const unsigned char *mySource = (unsigned char *) args->source; 49 UChar *myTarget = args->target; 50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 51 const UChar *targetLimit = args->targetLimit; 52 unsigned char *toUBytes = args->converter->toUBytes; 53 uint32_t ch, i; 54 55 /* Restore state of current sequence */ 56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 57 i = args->converter->toULength; /* restore # of bytes consumed */ 58 args->converter->toULength = 0; 59 60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 61 args->converter->toUnicodeStatus = 0; 62 goto morebytes; 63 } 64 65 while (mySource < sourceLimit && myTarget < targetLimit) { 66 i = 0; 67 ch = 0; 68 morebytes: 69 while (i < sizeof(uint32_t)) { 70 if (mySource < sourceLimit) { 71 ch = (ch << 8) | (uint8_t)(*mySource); 72 toUBytes[i++] = (char) *(mySource++); 73 } 74 else { 75 /* stores a partially calculated target*/ 76 /* + 1 to make 0 a valid character */ 77 args->converter->toUnicodeStatus = ch + 1; 78 args->converter->toULength = (int8_t) i; 79 goto donefornow; 80 } 81 } 82 83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 85 if (ch <= MAXIMUM_UCS2) 86 { 87 /* fits in 16 bits */ 88 *(myTarget++) = (UChar) ch; 89 } 90 else { 91 /* write out the surrogates */ 92 *(myTarget++) = U16_LEAD(ch); 93 ch = U16_TRAIL(ch); 94 if (myTarget < targetLimit) { 95 *(myTarget++) = (UChar)ch; 96 } 97 else { 98 /* Put in overflow buffer (not handled here) */ 99 args->converter->UCharErrorBuffer[0] = (UChar) ch; 100 args->converter->UCharErrorBufferLength = 1; 101 *err = U_BUFFER_OVERFLOW_ERROR; 102 break; 103 } 104 } 105 } 106 else { 107 args->converter->toULength = (int8_t)i; 108 *err = U_ILLEGAL_CHAR_FOUND; 109 break; 110 } 111 } 112 113 donefornow: 114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 115 /* End of target buffer */ 116 *err = U_BUFFER_OVERFLOW_ERROR; 117 } 118 119 args->target = myTarget; 120 args->source = (const char *) mySource; 121 } 122 123 static void 124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 125 UErrorCode * err) 126 { 127 const unsigned char *mySource = (unsigned char *) args->source; 128 UChar *myTarget = args->target; 129 int32_t *myOffsets = args->offsets; 130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 131 const UChar *targetLimit = args->targetLimit; 132 unsigned char *toUBytes = args->converter->toUBytes; 133 uint32_t ch, i; 134 int32_t offsetNum = 0; 135 136 /* Restore state of current sequence */ 137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 138 i = args->converter->toULength; /* restore # of bytes consumed */ 139 args->converter->toULength = 0; 140 141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 142 args->converter->toUnicodeStatus = 0; 143 goto morebytes; 144 } 145 146 while (mySource < sourceLimit && myTarget < targetLimit) { 147 i = 0; 148 ch = 0; 149 morebytes: 150 while (i < sizeof(uint32_t)) { 151 if (mySource < sourceLimit) { 152 ch = (ch << 8) | (uint8_t)(*mySource); 153 toUBytes[i++] = (char) *(mySource++); 154 } 155 else { 156 /* stores a partially calculated target*/ 157 /* + 1 to make 0 a valid character */ 158 args->converter->toUnicodeStatus = ch + 1; 159 args->converter->toULength = (int8_t) i; 160 goto donefornow; 161 } 162 } 163 164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 166 if (ch <= MAXIMUM_UCS2) { 167 /* fits in 16 bits */ 168 *(myTarget++) = (UChar) ch; 169 *(myOffsets++) = offsetNum; 170 } 171 else { 172 /* write out the surrogates */ 173 *(myTarget++) = U16_LEAD(ch); 174 *myOffsets++ = offsetNum; 175 ch = U16_TRAIL(ch); 176 if (myTarget < targetLimit) 177 { 178 *(myTarget++) = (UChar)ch; 179 *(myOffsets++) = offsetNum; 180 } 181 else { 182 /* Put in overflow buffer (not handled here) */ 183 args->converter->UCharErrorBuffer[0] = (UChar) ch; 184 args->converter->UCharErrorBufferLength = 1; 185 *err = U_BUFFER_OVERFLOW_ERROR; 186 break; 187 } 188 } 189 } 190 else { 191 args->converter->toULength = (int8_t)i; 192 *err = U_ILLEGAL_CHAR_FOUND; 193 break; 194 } 195 offsetNum += i; 196 } 197 198 donefornow: 199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 200 { 201 /* End of target buffer */ 202 *err = U_BUFFER_OVERFLOW_ERROR; 203 } 204 205 args->target = myTarget; 206 args->source = (const char *) mySource; 207 args->offsets = myOffsets; 208 } 209 210 static void 211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, 212 UErrorCode * err) 213 { 214 const UChar *mySource = args->source; 215 unsigned char *myTarget; 216 const UChar *sourceLimit = args->sourceLimit; 217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 218 UChar32 ch, ch2; 219 unsigned int indexToWrite; 220 unsigned char temp[sizeof(uint32_t)]; 221 222 if(mySource >= sourceLimit) { 223 /* no input, nothing to do */ 224 return; 225 } 226 227 /* write the BOM if necessary */ 228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 230 ucnv_fromUWriteBytes(args->converter, 231 bom, 4, 232 &args->target, args->targetLimit, 233 &args->offsets, -1, 234 err); 235 args->converter->fromUnicodeStatus=0; 236 } 237 238 myTarget = (unsigned char *) args->target; 239 temp[0] = 0; 240 241 if (args->converter->fromUChar32) { 242 ch = args->converter->fromUChar32; 243 args->converter->fromUChar32 = 0; 244 goto lowsurogate; 245 } 246 247 while (mySource < sourceLimit && myTarget < targetLimit) { 248 ch = *(mySource++); 249 250 if (U_IS_SURROGATE(ch)) { 251 if (U_IS_LEAD(ch)) { 252 lowsurogate: 253 if (mySource < sourceLimit) { 254 ch2 = *mySource; 255 if (U_IS_TRAIL(ch2)) { 256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 257 mySource++; 258 } 259 else { 260 /* this is an unmatched trail code unit (2nd surrogate) */ 261 /* callback(illegal) */ 262 args->converter->fromUChar32 = ch; 263 *err = U_ILLEGAL_CHAR_FOUND; 264 break; 265 } 266 } 267 else { 268 /* ran out of source */ 269 args->converter->fromUChar32 = ch; 270 if (args->flush) { 271 /* this is an unmatched trail code unit (2nd surrogate) */ 272 /* callback(illegal) */ 273 *err = U_ILLEGAL_CHAR_FOUND; 274 } 275 break; 276 } 277 } 278 else { 279 /* this is an unmatched trail code unit (2nd surrogate) */ 280 /* callback(illegal) */ 281 args->converter->fromUChar32 = ch; 282 *err = U_ILLEGAL_CHAR_FOUND; 283 break; 284 } 285 } 286 287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 288 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 291 292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 293 if (myTarget < targetLimit) { 294 *(myTarget++) = temp[indexToWrite]; 295 } 296 else { 297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 298 *err = U_BUFFER_OVERFLOW_ERROR; 299 } 300 } 301 } 302 303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 304 *err = U_BUFFER_OVERFLOW_ERROR; 305 } 306 307 args->target = (char *) myTarget; 308 args->source = mySource; 309 } 310 311 static void 312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 313 UErrorCode * err) 314 { 315 const UChar *mySource = args->source; 316 unsigned char *myTarget; 317 int32_t *myOffsets; 318 const UChar *sourceLimit = args->sourceLimit; 319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 320 UChar32 ch, ch2; 321 int32_t offsetNum = 0; 322 unsigned int indexToWrite; 323 unsigned char temp[sizeof(uint32_t)]; 324 325 if(mySource >= sourceLimit) { 326 /* no input, nothing to do */ 327 return; 328 } 329 330 /* write the BOM if necessary */ 331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 333 ucnv_fromUWriteBytes(args->converter, 334 bom, 4, 335 &args->target, args->targetLimit, 336 &args->offsets, -1, 337 err); 338 args->converter->fromUnicodeStatus=0; 339 } 340 341 myTarget = (unsigned char *) args->target; 342 myOffsets = args->offsets; 343 temp[0] = 0; 344 345 if (args->converter->fromUChar32) { 346 ch = args->converter->fromUChar32; 347 args->converter->fromUChar32 = 0; 348 goto lowsurogate; 349 } 350 351 while (mySource < sourceLimit && myTarget < targetLimit) { 352 ch = *(mySource++); 353 354 if (U_IS_SURROGATE(ch)) { 355 if (U_IS_LEAD(ch)) { 356 lowsurogate: 357 if (mySource < sourceLimit) { 358 ch2 = *mySource; 359 if (U_IS_TRAIL(ch2)) { 360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 361 mySource++; 362 } 363 else { 364 /* this is an unmatched trail code unit (2nd surrogate) */ 365 /* callback(illegal) */ 366 args->converter->fromUChar32 = ch; 367 *err = U_ILLEGAL_CHAR_FOUND; 368 break; 369 } 370 } 371 else { 372 /* ran out of source */ 373 args->converter->fromUChar32 = ch; 374 if (args->flush) { 375 /* this is an unmatched trail code unit (2nd surrogate) */ 376 /* callback(illegal) */ 377 *err = U_ILLEGAL_CHAR_FOUND; 378 } 379 break; 380 } 381 } 382 else { 383 /* this is an unmatched trail code unit (2nd surrogate) */ 384 /* callback(illegal) */ 385 args->converter->fromUChar32 = ch; 386 *err = U_ILLEGAL_CHAR_FOUND; 387 break; 388 } 389 } 390 391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 392 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 395 396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 397 if (myTarget < targetLimit) { 398 *(myTarget++) = temp[indexToWrite]; 399 *(myOffsets++) = offsetNum; 400 } 401 else { 402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 403 *err = U_BUFFER_OVERFLOW_ERROR; 404 } 405 } 406 offsetNum = offsetNum + 1 + (temp[1] != 0); 407 } 408 409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 410 *err = U_BUFFER_OVERFLOW_ERROR; 411 } 412 413 args->target = (char *) myTarget; 414 args->source = mySource; 415 args->offsets = myOffsets; 416 } 417 418 static UChar32 419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, 420 UErrorCode* err) 421 { 422 const uint8_t *mySource; 423 UChar32 myUChar; 424 int32_t length; 425 426 mySource = (const uint8_t *)args->source; 427 if (mySource >= (const uint8_t *)args->sourceLimit) 428 { 429 /* no input */ 430 *err = U_INDEX_OUTOFBOUNDS_ERROR; 431 return 0xffff; 432 } 433 434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 435 if (length < 4) 436 { 437 /* got a partial character */ 438 uprv_memcpy(args->converter->toUBytes, mySource, length); 439 args->converter->toULength = (int8_t)length; 440 args->source = (const char *)(mySource + length); 441 *err = U_TRUNCATED_CHAR_FOUND; 442 return 0xffff; 443 } 444 445 /* Don't even try to do a direct cast because the value may be on an odd address. */ 446 myUChar = ((UChar32)mySource[0] << 24) 447 | ((UChar32)mySource[1] << 16) 448 | ((UChar32)mySource[2] << 8) 449 | ((UChar32)mySource[3]); 450 451 args->source = (const char *)(mySource + 4); 452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 453 return myUChar; 454 } 455 456 uprv_memcpy(args->converter->toUBytes, mySource, 4); 457 args->converter->toULength = 4; 458 459 *err = U_ILLEGAL_CHAR_FOUND; 460 return 0xffff; 461 } 462 463 static const UConverterImpl _UTF32BEImpl = { 464 UCNV_UTF32_BigEndian, 465 466 NULL, 467 NULL, 468 469 NULL, 470 NULL, 471 NULL, 472 473 T_UConverter_toUnicode_UTF32_BE, 474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, 475 T_UConverter_fromUnicode_UTF32_BE, 476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 477 T_UConverter_getNextUChar_UTF32_BE, 478 479 NULL, 480 NULL, 481 NULL, 482 NULL, 483 ucnv_getNonSurrogateUnicodeSet 484 }; 485 486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 487 static const UConverterStaticData _UTF32BEStaticData = { 488 sizeof(UConverterStaticData), 489 "UTF-32BE", 490 1232, 491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, 492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, 493 0, 494 0, 495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 496 }; 497 498 const UConverterSharedData _UTF32BEData = 499 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl); 500 501 /* UTF-32LE ---------------------------------------------------------- */ 502 503 static void 504 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, 505 UErrorCode * err) 506 { 507 const unsigned char *mySource = (unsigned char *) args->source; 508 UChar *myTarget = args->target; 509 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 510 const UChar *targetLimit = args->targetLimit; 511 unsigned char *toUBytes = args->converter->toUBytes; 512 uint32_t ch, i; 513 514 /* Restore state of current sequence */ 515 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 516 { 517 i = args->converter->toULength; /* restore # of bytes consumed */ 518 args->converter->toULength = 0; 519 520 /* Stores the previously calculated ch from a previous call*/ 521 ch = args->converter->toUnicodeStatus - 1; 522 args->converter->toUnicodeStatus = 0; 523 goto morebytes; 524 } 525 526 while (mySource < sourceLimit && myTarget < targetLimit) 527 { 528 i = 0; 529 ch = 0; 530 morebytes: 531 while (i < sizeof(uint32_t)) 532 { 533 if (mySource < sourceLimit) 534 { 535 ch |= ((uint8_t)(*mySource)) << (i * 8); 536 toUBytes[i++] = (char) *(mySource++); 537 } 538 else 539 { 540 /* stores a partially calculated target*/ 541 /* + 1 to make 0 a valid character */ 542 args->converter->toUnicodeStatus = ch + 1; 543 args->converter->toULength = (int8_t) i; 544 goto donefornow; 545 } 546 } 547 548 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 549 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 550 if (ch <= MAXIMUM_UCS2) { 551 /* fits in 16 bits */ 552 *(myTarget++) = (UChar) ch; 553 } 554 else { 555 /* write out the surrogates */ 556 *(myTarget++) = U16_LEAD(ch); 557 ch = U16_TRAIL(ch); 558 if (myTarget < targetLimit) { 559 *(myTarget++) = (UChar)ch; 560 } 561 else { 562 /* Put in overflow buffer (not handled here) */ 563 args->converter->UCharErrorBuffer[0] = (UChar) ch; 564 args->converter->UCharErrorBufferLength = 1; 565 *err = U_BUFFER_OVERFLOW_ERROR; 566 break; 567 } 568 } 569 } 570 else { 571 args->converter->toULength = (int8_t)i; 572 *err = U_ILLEGAL_CHAR_FOUND; 573 break; 574 } 575 } 576 577 donefornow: 578 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 579 { 580 /* End of target buffer */ 581 *err = U_BUFFER_OVERFLOW_ERROR; 582 } 583 584 args->target = myTarget; 585 args->source = (const char *) mySource; 586 } 587 588 static void 589 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 590 UErrorCode * err) 591 { 592 const unsigned char *mySource = (unsigned char *) args->source; 593 UChar *myTarget = args->target; 594 int32_t *myOffsets = args->offsets; 595 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 596 const UChar *targetLimit = args->targetLimit; 597 unsigned char *toUBytes = args->converter->toUBytes; 598 uint32_t ch, i; 599 int32_t offsetNum = 0; 600 601 /* Restore state of current sequence */ 602 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 603 { 604 i = args->converter->toULength; /* restore # of bytes consumed */ 605 args->converter->toULength = 0; 606 607 /* Stores the previously calculated ch from a previous call*/ 608 ch = args->converter->toUnicodeStatus - 1; 609 args->converter->toUnicodeStatus = 0; 610 goto morebytes; 611 } 612 613 while (mySource < sourceLimit && myTarget < targetLimit) 614 { 615 i = 0; 616 ch = 0; 617 morebytes: 618 while (i < sizeof(uint32_t)) 619 { 620 if (mySource < sourceLimit) 621 { 622 ch |= ((uint8_t)(*mySource)) << (i * 8); 623 toUBytes[i++] = (char) *(mySource++); 624 } 625 else 626 { 627 /* stores a partially calculated target*/ 628 /* + 1 to make 0 a valid character */ 629 args->converter->toUnicodeStatus = ch + 1; 630 args->converter->toULength = (int8_t) i; 631 goto donefornow; 632 } 633 } 634 635 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) 636 { 637 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 638 if (ch <= MAXIMUM_UCS2) 639 { 640 /* fits in 16 bits */ 641 *(myTarget++) = (UChar) ch; 642 *(myOffsets++) = offsetNum; 643 } 644 else { 645 /* write out the surrogates */ 646 *(myTarget++) = U16_LEAD(ch); 647 *(myOffsets++) = offsetNum; 648 ch = U16_TRAIL(ch); 649 if (myTarget < targetLimit) 650 { 651 *(myTarget++) = (UChar)ch; 652 *(myOffsets++) = offsetNum; 653 } 654 else 655 { 656 /* Put in overflow buffer (not handled here) */ 657 args->converter->UCharErrorBuffer[0] = (UChar) ch; 658 args->converter->UCharErrorBufferLength = 1; 659 *err = U_BUFFER_OVERFLOW_ERROR; 660 break; 661 } 662 } 663 } 664 else 665 { 666 args->converter->toULength = (int8_t)i; 667 *err = U_ILLEGAL_CHAR_FOUND; 668 break; 669 } 670 offsetNum += i; 671 } 672 673 donefornow: 674 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 675 { 676 /* End of target buffer */ 677 *err = U_BUFFER_OVERFLOW_ERROR; 678 } 679 680 args->target = myTarget; 681 args->source = (const char *) mySource; 682 args->offsets = myOffsets; 683 } 684 685 static void 686 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, 687 UErrorCode * err) 688 { 689 const UChar *mySource = args->source; 690 unsigned char *myTarget; 691 const UChar *sourceLimit = args->sourceLimit; 692 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 693 UChar32 ch, ch2; 694 unsigned int indexToWrite; 695 unsigned char temp[sizeof(uint32_t)]; 696 697 if(mySource >= sourceLimit) { 698 /* no input, nothing to do */ 699 return; 700 } 701 702 /* write the BOM if necessary */ 703 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 704 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 705 ucnv_fromUWriteBytes(args->converter, 706 bom, 4, 707 &args->target, args->targetLimit, 708 &args->offsets, -1, 709 err); 710 args->converter->fromUnicodeStatus=0; 711 } 712 713 myTarget = (unsigned char *) args->target; 714 temp[3] = 0; 715 716 if (args->converter->fromUChar32) 717 { 718 ch = args->converter->fromUChar32; 719 args->converter->fromUChar32 = 0; 720 goto lowsurogate; 721 } 722 723 while (mySource < sourceLimit && myTarget < targetLimit) 724 { 725 ch = *(mySource++); 726 727 if (U16_IS_SURROGATE(ch)) { 728 if (U16_IS_LEAD(ch)) 729 { 730 lowsurogate: 731 if (mySource < sourceLimit) 732 { 733 ch2 = *mySource; 734 if (U16_IS_TRAIL(ch2)) { 735 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 736 mySource++; 737 } 738 else { 739 /* this is an unmatched trail code unit (2nd surrogate) */ 740 /* callback(illegal) */ 741 args->converter->fromUChar32 = ch; 742 *err = U_ILLEGAL_CHAR_FOUND; 743 break; 744 } 745 } 746 else { 747 /* ran out of source */ 748 args->converter->fromUChar32 = ch; 749 if (args->flush) { 750 /* this is an unmatched trail code unit (2nd surrogate) */ 751 /* callback(illegal) */ 752 *err = U_ILLEGAL_CHAR_FOUND; 753 } 754 break; 755 } 756 } 757 else { 758 /* this is an unmatched trail code unit (2nd surrogate) */ 759 /* callback(illegal) */ 760 args->converter->fromUChar32 = ch; 761 *err = U_ILLEGAL_CHAR_FOUND; 762 break; 763 } 764 } 765 766 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 767 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 768 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 769 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 770 771 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 772 { 773 if (myTarget < targetLimit) 774 { 775 *(myTarget++) = temp[indexToWrite]; 776 } 777 else 778 { 779 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 780 *err = U_BUFFER_OVERFLOW_ERROR; 781 } 782 } 783 } 784 785 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 786 { 787 *err = U_BUFFER_OVERFLOW_ERROR; 788 } 789 790 args->target = (char *) myTarget; 791 args->source = mySource; 792 } 793 794 static void 795 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 796 UErrorCode * err) 797 { 798 const UChar *mySource = args->source; 799 unsigned char *myTarget; 800 int32_t *myOffsets; 801 const UChar *sourceLimit = args->sourceLimit; 802 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 803 UChar32 ch, ch2; 804 unsigned int indexToWrite; 805 unsigned char temp[sizeof(uint32_t)]; 806 int32_t offsetNum = 0; 807 808 if(mySource >= sourceLimit) { 809 /* no input, nothing to do */ 810 return; 811 } 812 813 /* write the BOM if necessary */ 814 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 815 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 816 ucnv_fromUWriteBytes(args->converter, 817 bom, 4, 818 &args->target, args->targetLimit, 819 &args->offsets, -1, 820 err); 821 args->converter->fromUnicodeStatus=0; 822 } 823 824 myTarget = (unsigned char *) args->target; 825 myOffsets = args->offsets; 826 temp[3] = 0; 827 828 if (args->converter->fromUChar32) 829 { 830 ch = args->converter->fromUChar32; 831 args->converter->fromUChar32 = 0; 832 goto lowsurogate; 833 } 834 835 while (mySource < sourceLimit && myTarget < targetLimit) 836 { 837 ch = *(mySource++); 838 839 if (U16_IS_SURROGATE(ch)) { 840 if (U16_IS_LEAD(ch)) 841 { 842 lowsurogate: 843 if (mySource < sourceLimit) 844 { 845 ch2 = *mySource; 846 if (U16_IS_TRAIL(ch2)) 847 { 848 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 849 mySource++; 850 } 851 else { 852 /* this is an unmatched trail code unit (2nd surrogate) */ 853 /* callback(illegal) */ 854 args->converter->fromUChar32 = ch; 855 *err = U_ILLEGAL_CHAR_FOUND; 856 break; 857 } 858 } 859 else { 860 /* ran out of source */ 861 args->converter->fromUChar32 = ch; 862 if (args->flush) { 863 /* this is an unmatched trail code unit (2nd surrogate) */ 864 /* callback(illegal) */ 865 *err = U_ILLEGAL_CHAR_FOUND; 866 } 867 break; 868 } 869 } 870 else { 871 /* this is an unmatched trail code unit (2nd surrogate) */ 872 /* callback(illegal) */ 873 args->converter->fromUChar32 = ch; 874 *err = U_ILLEGAL_CHAR_FOUND; 875 break; 876 } 877 } 878 879 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 880 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 881 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 882 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 883 884 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 885 { 886 if (myTarget < targetLimit) 887 { 888 *(myTarget++) = temp[indexToWrite]; 889 *(myOffsets++) = offsetNum; 890 } 891 else 892 { 893 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 894 *err = U_BUFFER_OVERFLOW_ERROR; 895 } 896 } 897 offsetNum = offsetNum + 1 + (temp[2] != 0); 898 } 899 900 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 901 { 902 *err = U_BUFFER_OVERFLOW_ERROR; 903 } 904 905 args->target = (char *) myTarget; 906 args->source = mySource; 907 args->offsets = myOffsets; 908 } 909 910 static UChar32 911 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, 912 UErrorCode* err) 913 { 914 const uint8_t *mySource; 915 UChar32 myUChar; 916 int32_t length; 917 918 mySource = (const uint8_t *)args->source; 919 if (mySource >= (const uint8_t *)args->sourceLimit) 920 { 921 /* no input */ 922 *err = U_INDEX_OUTOFBOUNDS_ERROR; 923 return 0xffff; 924 } 925 926 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 927 if (length < 4) 928 { 929 /* got a partial character */ 930 uprv_memcpy(args->converter->toUBytes, mySource, length); 931 args->converter->toULength = (int8_t)length; 932 args->source = (const char *)(mySource + length); 933 *err = U_TRUNCATED_CHAR_FOUND; 934 return 0xffff; 935 } 936 937 /* Don't even try to do a direct cast because the value may be on an odd address. */ 938 myUChar = ((UChar32)mySource[3] << 24) 939 | ((UChar32)mySource[2] << 16) 940 | ((UChar32)mySource[1] << 8) 941 | ((UChar32)mySource[0]); 942 943 args->source = (const char *)(mySource + 4); 944 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 945 return myUChar; 946 } 947 948 uprv_memcpy(args->converter->toUBytes, mySource, 4); 949 args->converter->toULength = 4; 950 951 *err = U_ILLEGAL_CHAR_FOUND; 952 return 0xffff; 953 } 954 955 static const UConverterImpl _UTF32LEImpl = { 956 UCNV_UTF32_LittleEndian, 957 958 NULL, 959 NULL, 960 961 NULL, 962 NULL, 963 NULL, 964 965 T_UConverter_toUnicode_UTF32_LE, 966 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, 967 T_UConverter_fromUnicode_UTF32_LE, 968 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 969 T_UConverter_getNextUChar_UTF32_LE, 970 971 NULL, 972 NULL, 973 NULL, 974 NULL, 975 ucnv_getNonSurrogateUnicodeSet 976 }; 977 978 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 979 static const UConverterStaticData _UTF32LEStaticData = { 980 sizeof(UConverterStaticData), 981 "UTF-32LE", 982 1234, 983 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, 984 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, 985 0, 986 0, 987 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 988 }; 989 990 991 const UConverterSharedData _UTF32LEData = 992 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl); 993 994 /* UTF-32 (Detect BOM) ------------------------------------------------------ */ 995 996 /* 997 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE 998 * accordingly. 999 * 1000 * State values: 1001 * 0 initial state 1002 * 1 saw 00 1003 * 2 saw 00 00 1004 * 3 saw 00 00 FE 1005 * 4 - 1006 * 5 saw FF 1007 * 6 saw FF FE 1008 * 7 saw FF FE 00 1009 * 8 UTF-32BE mode 1010 * 9 UTF-32LE mode 1011 * 1012 * During detection: state&3==number of matching bytes so far. 1013 * 1014 * On output, emit U+FEFF as the first code point. 1015 */ 1016 1017 static void 1018 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { 1019 if(choice<=UCNV_RESET_TO_UNICODE) { 1020 /* reset toUnicode: state=0 */ 1021 cnv->mode=0; 1022 } 1023 if(choice!=UCNV_RESET_TO_UNICODE) { 1024 /* reset fromUnicode: prepare to output the UTF-32PE BOM */ 1025 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1026 } 1027 } 1028 1029 static void 1030 _UTF32Open(UConverter *cnv, 1031 UConverterLoadArgs *pArgs, 1032 UErrorCode *pErrorCode) { 1033 _UTF32Reset(cnv, UCNV_RESET_BOTH); 1034 } 1035 1036 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 }; 1037 1038 static void 1039 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1040 UErrorCode *pErrorCode) { 1041 UConverter *cnv=pArgs->converter; 1042 const char *source=pArgs->source; 1043 const char *sourceLimit=pArgs->sourceLimit; 1044 int32_t *offsets=pArgs->offsets; 1045 1046 int32_t state, offsetDelta; 1047 char b; 1048 1049 state=cnv->mode; 1050 1051 /* 1052 * If we detect a BOM in this buffer, then we must add the BOM size to the 1053 * offsets because the actual converter function will not see and count the BOM. 1054 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1055 */ 1056 offsetDelta=0; 1057 1058 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1059 switch(state) { 1060 case 0: 1061 b=*source; 1062 if(b==0) { 1063 state=1; /* could be 00 00 FE FF */ 1064 } else if(b==(char)0xff) { 1065 state=5; /* could be FF FE 00 00 */ 1066 } else { 1067 state=8; /* default to UTF-32BE */ 1068 continue; 1069 } 1070 ++source; 1071 break; 1072 case 1: 1073 case 2: 1074 case 3: 1075 case 5: 1076 case 6: 1077 case 7: 1078 if(*source==utf32BOM[state]) { 1079 ++state; 1080 ++source; 1081 if(state==4) { 1082 state=8; /* detect UTF-32BE */ 1083 offsetDelta=(int32_t)(source-pArgs->source); 1084 } else if(state==8) { 1085 state=9; /* detect UTF-32LE */ 1086 offsetDelta=(int32_t)(source-pArgs->source); 1087 } 1088 } else { 1089 /* switch to UTF-32BE and pass the previous bytes */ 1090 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ 1091 1092 /* reset the source */ 1093 source=pArgs->source; 1094 1095 if(count==(state&3)) { 1096 /* simple: all in the same buffer, just reset source */ 1097 } else { 1098 UBool oldFlush=pArgs->flush; 1099 1100 /* some of the bytes are from a previous buffer, replay those first */ 1101 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1102 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ 1103 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ 1104 1105 /* no offsets: bytes from previous buffer, and not enough for output */ 1106 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1107 1108 /* restore real pointers; pArgs->source will be set in case 8/9 */ 1109 pArgs->sourceLimit=sourceLimit; 1110 pArgs->flush=oldFlush; 1111 } 1112 state=8; 1113 continue; 1114 } 1115 break; 1116 case 8: 1117 /* call UTF-32BE */ 1118 pArgs->source=source; 1119 if(offsets==NULL) { 1120 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1121 } else { 1122 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); 1123 } 1124 source=pArgs->source; 1125 break; 1126 case 9: 1127 /* call UTF-32LE */ 1128 pArgs->source=source; 1129 if(offsets==NULL) { 1130 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1131 } else { 1132 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); 1133 } 1134 source=pArgs->source; 1135 break; 1136 default: 1137 break; /* does not occur */ 1138 } 1139 } 1140 1141 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1142 if(offsets!=NULL && offsetDelta!=0) { 1143 int32_t *offsetsLimit=pArgs->offsets; 1144 while(offsets<offsetsLimit) { 1145 *offsets++ += offsetDelta; 1146 } 1147 } 1148 1149 pArgs->source=source; 1150 1151 if(source==sourceLimit && pArgs->flush) { 1152 /* handle truncated input */ 1153 switch(state) { 1154 case 0: 1155 break; /* no input at all, nothing to do */ 1156 case 8: 1157 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1158 break; 1159 case 9: 1160 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1161 break; 1162 default: 1163 /* handle 0<state<8: call UTF-32BE with too-short input */ 1164 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1165 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ 1166 1167 /* no offsets: not enough for output */ 1168 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1169 pArgs->source=source; 1170 pArgs->sourceLimit=sourceLimit; 1171 state=8; 1172 break; 1173 } 1174 } 1175 1176 cnv->mode=state; 1177 } 1178 1179 static UChar32 1180 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, 1181 UErrorCode *pErrorCode) { 1182 switch(pArgs->converter->mode) { 1183 case 8: 1184 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); 1185 case 9: 1186 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); 1187 default: 1188 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1189 } 1190 } 1191 1192 static const UConverterImpl _UTF32Impl = { 1193 UCNV_UTF32, 1194 1195 NULL, 1196 NULL, 1197 1198 _UTF32Open, 1199 NULL, 1200 _UTF32Reset, 1201 1202 _UTF32ToUnicodeWithOffsets, 1203 _UTF32ToUnicodeWithOffsets, 1204 #if U_IS_BIG_ENDIAN 1205 T_UConverter_fromUnicode_UTF32_BE, 1206 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 1207 #else 1208 T_UConverter_fromUnicode_UTF32_LE, 1209 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 1210 #endif 1211 _UTF32GetNextUChar, 1212 1213 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1214 NULL, 1215 NULL, 1216 NULL, 1217 ucnv_getNonSurrogateUnicodeSet 1218 }; 1219 1220 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */ 1221 static const UConverterStaticData _UTF32StaticData = { 1222 sizeof(UConverterStaticData), 1223 "UTF-32", 1224 1236, 1225 UCNV_IBM, UCNV_UTF32, 4, 4, 1226 #if U_IS_BIG_ENDIAN 1227 { 0, 0, 0xff, 0xfd }, 4, 1228 #else 1229 { 0xfd, 0xff, 0, 0 }, 4, 1230 #endif 1231 FALSE, FALSE, 1232 0, 1233 0, 1234 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1235 }; 1236 1237 const UConverterSharedData _UTF32Data = 1238 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl); 1239 1240 #endif 1241