1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2011, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv_u32.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2002jul01 12 * created by: Markus W. Scherer 13 * 14 * UTF-32 converter implementation. Used to be in ucnv_utf.c. 15 */ 16 17 #include "unicode/utypes.h" 18 19 #if !UCONFIG_NO_CONVERSION 20 21 #include "unicode/ucnv.h" 22 #include "unicode/utf.h" 23 #include "ucnv_bld.h" 24 #include "ucnv_cnv.h" 25 #include "cmemory.h" 26 27 #define MAXIMUM_UCS2 0x0000FFFF 28 #define MAXIMUM_UTF 0x0010FFFF 29 #define HALF_SHIFT 10 30 #define HALF_BASE 0x0010000 31 #define HALF_MASK 0x3FF 32 #define SURROGATE_HIGH_START 0xD800 33 #define SURROGATE_LOW_START 0xDC00 34 35 /* -SURROGATE_LOW_START + HALF_BASE */ 36 #define SURROGATE_LOW_BASE 9216 37 38 enum { 39 UCNV_NEED_TO_WRITE_BOM=1 40 }; 41 42 /* UTF-32BE ----------------------------------------------------------------- */ 43 44 static void 45 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, 46 UErrorCode * err) 47 { 48 const unsigned char *mySource = (unsigned char *) args->source; 49 UChar *myTarget = args->target; 50 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 51 const UChar *targetLimit = args->targetLimit; 52 unsigned char *toUBytes = args->converter->toUBytes; 53 uint32_t ch, i; 54 55 /* Restore state of current sequence */ 56 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 57 i = args->converter->toULength; /* restore # of bytes consumed */ 58 args->converter->toULength = 0; 59 60 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 61 args->converter->toUnicodeStatus = 0; 62 goto morebytes; 63 } 64 65 while (mySource < sourceLimit && myTarget < targetLimit) { 66 i = 0; 67 ch = 0; 68 morebytes: 69 while (i < sizeof(uint32_t)) { 70 if (mySource < sourceLimit) { 71 ch = (ch << 8) | (uint8_t)(*mySource); 72 toUBytes[i++] = (char) *(mySource++); 73 } 74 else { 75 /* stores a partially calculated target*/ 76 /* + 1 to make 0 a valid character */ 77 args->converter->toUnicodeStatus = ch + 1; 78 args->converter->toULength = (int8_t) i; 79 goto donefornow; 80 } 81 } 82 83 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 84 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 85 if (ch <= MAXIMUM_UCS2) 86 { 87 /* fits in 16 bits */ 88 *(myTarget++) = (UChar) ch; 89 } 90 else { 91 /* write out the surrogates */ 92 *(myTarget++) = U16_LEAD(ch); 93 ch = U16_TRAIL(ch); 94 if (myTarget < targetLimit) { 95 *(myTarget++) = (UChar)ch; 96 } 97 else { 98 /* Put in overflow buffer (not handled here) */ 99 args->converter->UCharErrorBuffer[0] = (UChar) ch; 100 args->converter->UCharErrorBufferLength = 1; 101 *err = U_BUFFER_OVERFLOW_ERROR; 102 break; 103 } 104 } 105 } 106 else { 107 args->converter->toULength = (int8_t)i; 108 *err = U_ILLEGAL_CHAR_FOUND; 109 break; 110 } 111 } 112 113 donefornow: 114 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 115 /* End of target buffer */ 116 *err = U_BUFFER_OVERFLOW_ERROR; 117 } 118 119 args->target = myTarget; 120 args->source = (const char *) mySource; 121 } 122 123 static void 124 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 125 UErrorCode * err) 126 { 127 const unsigned char *mySource = (unsigned char *) args->source; 128 UChar *myTarget = args->target; 129 int32_t *myOffsets = args->offsets; 130 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 131 const UChar *targetLimit = args->targetLimit; 132 unsigned char *toUBytes = args->converter->toUBytes; 133 uint32_t ch, i; 134 int32_t offsetNum = 0; 135 136 /* Restore state of current sequence */ 137 if (args->converter->toUnicodeStatus && myTarget < targetLimit) { 138 i = args->converter->toULength; /* restore # of bytes consumed */ 139 args->converter->toULength = 0; 140 141 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 142 args->converter->toUnicodeStatus = 0; 143 goto morebytes; 144 } 145 146 while (mySource < sourceLimit && myTarget < targetLimit) { 147 i = 0; 148 ch = 0; 149 morebytes: 150 while (i < sizeof(uint32_t)) { 151 if (mySource < sourceLimit) { 152 ch = (ch << 8) | (uint8_t)(*mySource); 153 toUBytes[i++] = (char) *(mySource++); 154 } 155 else { 156 /* stores a partially calculated target*/ 157 /* + 1 to make 0 a valid character */ 158 args->converter->toUnicodeStatus = ch + 1; 159 args->converter->toULength = (int8_t) i; 160 goto donefornow; 161 } 162 } 163 164 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 165 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 166 if (ch <= MAXIMUM_UCS2) { 167 /* fits in 16 bits */ 168 *(myTarget++) = (UChar) ch; 169 *(myOffsets++) = offsetNum; 170 } 171 else { 172 /* write out the surrogates */ 173 *(myTarget++) = U16_LEAD(ch); 174 *myOffsets++ = offsetNum; 175 ch = U16_TRAIL(ch); 176 if (myTarget < targetLimit) 177 { 178 *(myTarget++) = (UChar)ch; 179 *(myOffsets++) = offsetNum; 180 } 181 else { 182 /* Put in overflow buffer (not handled here) */ 183 args->converter->UCharErrorBuffer[0] = (UChar) ch; 184 args->converter->UCharErrorBufferLength = 1; 185 *err = U_BUFFER_OVERFLOW_ERROR; 186 break; 187 } 188 } 189 } 190 else { 191 args->converter->toULength = (int8_t)i; 192 *err = U_ILLEGAL_CHAR_FOUND; 193 break; 194 } 195 offsetNum += i; 196 } 197 198 donefornow: 199 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 200 { 201 /* End of target buffer */ 202 *err = U_BUFFER_OVERFLOW_ERROR; 203 } 204 205 args->target = myTarget; 206 args->source = (const char *) mySource; 207 args->offsets = myOffsets; 208 } 209 210 static void 211 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, 212 UErrorCode * err) 213 { 214 const UChar *mySource = args->source; 215 unsigned char *myTarget; 216 const UChar *sourceLimit = args->sourceLimit; 217 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 218 UChar32 ch, ch2; 219 unsigned int indexToWrite; 220 unsigned char temp[sizeof(uint32_t)]; 221 222 if(mySource >= sourceLimit) { 223 /* no input, nothing to do */ 224 return; 225 } 226 227 /* write the BOM if necessary */ 228 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 229 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 230 ucnv_fromUWriteBytes(args->converter, 231 bom, 4, 232 &args->target, args->targetLimit, 233 &args->offsets, -1, 234 err); 235 args->converter->fromUnicodeStatus=0; 236 } 237 238 myTarget = (unsigned char *) args->target; 239 temp[0] = 0; 240 241 if (args->converter->fromUChar32) { 242 ch = args->converter->fromUChar32; 243 args->converter->fromUChar32 = 0; 244 goto lowsurogate; 245 } 246 247 while (mySource < sourceLimit && myTarget < targetLimit) { 248 ch = *(mySource++); 249 250 if (U_IS_SURROGATE(ch)) { 251 if (U_IS_LEAD(ch)) { 252 lowsurogate: 253 if (mySource < sourceLimit) { 254 ch2 = *mySource; 255 if (U_IS_TRAIL(ch2)) { 256 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 257 mySource++; 258 } 259 else { 260 /* this is an unmatched trail code unit (2nd surrogate) */ 261 /* callback(illegal) */ 262 args->converter->fromUChar32 = ch; 263 *err = U_ILLEGAL_CHAR_FOUND; 264 break; 265 } 266 } 267 else { 268 /* ran out of source */ 269 args->converter->fromUChar32 = ch; 270 if (args->flush) { 271 /* this is an unmatched trail code unit (2nd surrogate) */ 272 /* callback(illegal) */ 273 *err = U_ILLEGAL_CHAR_FOUND; 274 } 275 break; 276 } 277 } 278 else { 279 /* this is an unmatched trail code unit (2nd surrogate) */ 280 /* callback(illegal) */ 281 args->converter->fromUChar32 = ch; 282 *err = U_ILLEGAL_CHAR_FOUND; 283 break; 284 } 285 } 286 287 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 288 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 289 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 290 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 291 292 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 293 if (myTarget < targetLimit) { 294 *(myTarget++) = temp[indexToWrite]; 295 } 296 else { 297 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 298 *err = U_BUFFER_OVERFLOW_ERROR; 299 } 300 } 301 } 302 303 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 304 *err = U_BUFFER_OVERFLOW_ERROR; 305 } 306 307 args->target = (char *) myTarget; 308 args->source = mySource; 309 } 310 311 static void 312 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 313 UErrorCode * err) 314 { 315 const UChar *mySource = args->source; 316 unsigned char *myTarget; 317 int32_t *myOffsets; 318 const UChar *sourceLimit = args->sourceLimit; 319 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 320 UChar32 ch, ch2; 321 int32_t offsetNum = 0; 322 unsigned int indexToWrite; 323 unsigned char temp[sizeof(uint32_t)]; 324 325 if(mySource >= sourceLimit) { 326 /* no input, nothing to do */ 327 return; 328 } 329 330 /* write the BOM if necessary */ 331 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 332 static const char bom[]={ 0, 0, (char)0xfe, (char)0xff }; 333 ucnv_fromUWriteBytes(args->converter, 334 bom, 4, 335 &args->target, args->targetLimit, 336 &args->offsets, -1, 337 err); 338 args->converter->fromUnicodeStatus=0; 339 } 340 341 myTarget = (unsigned char *) args->target; 342 myOffsets = args->offsets; 343 temp[0] = 0; 344 345 if (args->converter->fromUChar32) { 346 ch = args->converter->fromUChar32; 347 args->converter->fromUChar32 = 0; 348 goto lowsurogate; 349 } 350 351 while (mySource < sourceLimit && myTarget < targetLimit) { 352 ch = *(mySource++); 353 354 if (U_IS_SURROGATE(ch)) { 355 if (U_IS_LEAD(ch)) { 356 lowsurogate: 357 if (mySource < sourceLimit) { 358 ch2 = *mySource; 359 if (U_IS_TRAIL(ch2)) { 360 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 361 mySource++; 362 } 363 else { 364 /* this is an unmatched trail code unit (2nd surrogate) */ 365 /* callback(illegal) */ 366 args->converter->fromUChar32 = ch; 367 *err = U_ILLEGAL_CHAR_FOUND; 368 break; 369 } 370 } 371 else { 372 /* ran out of source */ 373 args->converter->fromUChar32 = ch; 374 if (args->flush) { 375 /* this is an unmatched trail code unit (2nd surrogate) */ 376 /* callback(illegal) */ 377 *err = U_ILLEGAL_CHAR_FOUND; 378 } 379 break; 380 } 381 } 382 else { 383 /* this is an unmatched trail code unit (2nd surrogate) */ 384 /* callback(illegal) */ 385 args->converter->fromUChar32 = ch; 386 *err = U_ILLEGAL_CHAR_FOUND; 387 break; 388 } 389 } 390 391 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 392 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 393 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 394 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 395 396 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 397 if (myTarget < targetLimit) { 398 *(myTarget++) = temp[indexToWrite]; 399 *(myOffsets++) = offsetNum; 400 } 401 else { 402 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 403 *err = U_BUFFER_OVERFLOW_ERROR; 404 } 405 } 406 offsetNum = offsetNum + 1 + (temp[1] != 0); 407 } 408 409 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 410 *err = U_BUFFER_OVERFLOW_ERROR; 411 } 412 413 args->target = (char *) myTarget; 414 args->source = mySource; 415 args->offsets = myOffsets; 416 } 417 418 static UChar32 419 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, 420 UErrorCode* err) 421 { 422 const uint8_t *mySource; 423 UChar32 myUChar; 424 int32_t length; 425 426 mySource = (const uint8_t *)args->source; 427 if (mySource >= (const uint8_t *)args->sourceLimit) 428 { 429 /* no input */ 430 *err = U_INDEX_OUTOFBOUNDS_ERROR; 431 return 0xffff; 432 } 433 434 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 435 if (length < 4) 436 { 437 /* got a partial character */ 438 uprv_memcpy(args->converter->toUBytes, mySource, length); 439 args->converter->toULength = (int8_t)length; 440 args->source = (const char *)(mySource + length); 441 *err = U_TRUNCATED_CHAR_FOUND; 442 return 0xffff; 443 } 444 445 /* Don't even try to do a direct cast because the value may be on an odd address. */ 446 myUChar = ((UChar32)mySource[0] << 24) 447 | ((UChar32)mySource[1] << 16) 448 | ((UChar32)mySource[2] << 8) 449 | ((UChar32)mySource[3]); 450 451 args->source = (const char *)(mySource + 4); 452 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 453 return myUChar; 454 } 455 456 uprv_memcpy(args->converter->toUBytes, mySource, 4); 457 args->converter->toULength = 4; 458 459 *err = U_ILLEGAL_CHAR_FOUND; 460 return 0xffff; 461 } 462 463 static const UConverterImpl _UTF32BEImpl = { 464 UCNV_UTF32_BigEndian, 465 466 NULL, 467 NULL, 468 469 NULL, 470 NULL, 471 NULL, 472 473 T_UConverter_toUnicode_UTF32_BE, 474 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, 475 T_UConverter_fromUnicode_UTF32_BE, 476 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 477 T_UConverter_getNextUChar_UTF32_BE, 478 479 NULL, 480 NULL, 481 NULL, 482 NULL, 483 ucnv_getNonSurrogateUnicodeSet 484 }; 485 486 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 487 static const UConverterStaticData _UTF32BEStaticData = { 488 sizeof(UConverterStaticData), 489 "UTF-32BE", 490 1232, 491 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, 492 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, 493 0, 494 0, 495 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 496 }; 497 498 const UConverterSharedData _UTF32BEData = { 499 sizeof(UConverterSharedData), ~((uint32_t) 0), 500 NULL, NULL, &_UTF32BEStaticData, FALSE, &_UTF32BEImpl, 501 0 502 }; 503 504 /* UTF-32LE ---------------------------------------------------------- */ 505 506 static void 507 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, 508 UErrorCode * err) 509 { 510 const unsigned char *mySource = (unsigned char *) args->source; 511 UChar *myTarget = args->target; 512 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 513 const UChar *targetLimit = args->targetLimit; 514 unsigned char *toUBytes = args->converter->toUBytes; 515 uint32_t ch, i; 516 517 /* Restore state of current sequence */ 518 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 519 { 520 i = args->converter->toULength; /* restore # of bytes consumed */ 521 args->converter->toULength = 0; 522 523 /* Stores the previously calculated ch from a previous call*/ 524 ch = args->converter->toUnicodeStatus - 1; 525 args->converter->toUnicodeStatus = 0; 526 goto morebytes; 527 } 528 529 while (mySource < sourceLimit && myTarget < targetLimit) 530 { 531 i = 0; 532 ch = 0; 533 morebytes: 534 while (i < sizeof(uint32_t)) 535 { 536 if (mySource < sourceLimit) 537 { 538 ch |= ((uint8_t)(*mySource)) << (i * 8); 539 toUBytes[i++] = (char) *(mySource++); 540 } 541 else 542 { 543 /* stores a partially calculated target*/ 544 /* + 1 to make 0 a valid character */ 545 args->converter->toUnicodeStatus = ch + 1; 546 args->converter->toULength = (int8_t) i; 547 goto donefornow; 548 } 549 } 550 551 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 552 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 553 if (ch <= MAXIMUM_UCS2) { 554 /* fits in 16 bits */ 555 *(myTarget++) = (UChar) ch; 556 } 557 else { 558 /* write out the surrogates */ 559 *(myTarget++) = U16_LEAD(ch); 560 ch = U16_TRAIL(ch); 561 if (myTarget < targetLimit) { 562 *(myTarget++) = (UChar)ch; 563 } 564 else { 565 /* Put in overflow buffer (not handled here) */ 566 args->converter->UCharErrorBuffer[0] = (UChar) ch; 567 args->converter->UCharErrorBufferLength = 1; 568 *err = U_BUFFER_OVERFLOW_ERROR; 569 break; 570 } 571 } 572 } 573 else { 574 args->converter->toULength = (int8_t)i; 575 *err = U_ILLEGAL_CHAR_FOUND; 576 break; 577 } 578 } 579 580 donefornow: 581 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 582 { 583 /* End of target buffer */ 584 *err = U_BUFFER_OVERFLOW_ERROR; 585 } 586 587 args->target = myTarget; 588 args->source = (const char *) mySource; 589 } 590 591 static void 592 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 593 UErrorCode * err) 594 { 595 const unsigned char *mySource = (unsigned char *) args->source; 596 UChar *myTarget = args->target; 597 int32_t *myOffsets = args->offsets; 598 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 599 const UChar *targetLimit = args->targetLimit; 600 unsigned char *toUBytes = args->converter->toUBytes; 601 uint32_t ch, i; 602 int32_t offsetNum = 0; 603 604 /* Restore state of current sequence */ 605 if (args->converter->toUnicodeStatus && myTarget < targetLimit) 606 { 607 i = args->converter->toULength; /* restore # of bytes consumed */ 608 args->converter->toULength = 0; 609 610 /* Stores the previously calculated ch from a previous call*/ 611 ch = args->converter->toUnicodeStatus - 1; 612 args->converter->toUnicodeStatus = 0; 613 goto morebytes; 614 } 615 616 while (mySource < sourceLimit && myTarget < targetLimit) 617 { 618 i = 0; 619 ch = 0; 620 morebytes: 621 while (i < sizeof(uint32_t)) 622 { 623 if (mySource < sourceLimit) 624 { 625 ch |= ((uint8_t)(*mySource)) << (i * 8); 626 toUBytes[i++] = (char) *(mySource++); 627 } 628 else 629 { 630 /* stores a partially calculated target*/ 631 /* + 1 to make 0 a valid character */ 632 args->converter->toUnicodeStatus = ch + 1; 633 args->converter->toULength = (int8_t) i; 634 goto donefornow; 635 } 636 } 637 638 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) 639 { 640 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 641 if (ch <= MAXIMUM_UCS2) 642 { 643 /* fits in 16 bits */ 644 *(myTarget++) = (UChar) ch; 645 *(myOffsets++) = offsetNum; 646 } 647 else { 648 /* write out the surrogates */ 649 *(myTarget++) = U16_LEAD(ch); 650 *(myOffsets++) = offsetNum; 651 ch = U16_TRAIL(ch); 652 if (myTarget < targetLimit) 653 { 654 *(myTarget++) = (UChar)ch; 655 *(myOffsets++) = offsetNum; 656 } 657 else 658 { 659 /* Put in overflow buffer (not handled here) */ 660 args->converter->UCharErrorBuffer[0] = (UChar) ch; 661 args->converter->UCharErrorBufferLength = 1; 662 *err = U_BUFFER_OVERFLOW_ERROR; 663 break; 664 } 665 } 666 } 667 else 668 { 669 args->converter->toULength = (int8_t)i; 670 *err = U_ILLEGAL_CHAR_FOUND; 671 break; 672 } 673 offsetNum += i; 674 } 675 676 donefornow: 677 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 678 { 679 /* End of target buffer */ 680 *err = U_BUFFER_OVERFLOW_ERROR; 681 } 682 683 args->target = myTarget; 684 args->source = (const char *) mySource; 685 args->offsets = myOffsets; 686 } 687 688 static void 689 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, 690 UErrorCode * err) 691 { 692 const UChar *mySource = args->source; 693 unsigned char *myTarget; 694 const UChar *sourceLimit = args->sourceLimit; 695 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 696 UChar32 ch, ch2; 697 unsigned int indexToWrite; 698 unsigned char temp[sizeof(uint32_t)]; 699 700 if(mySource >= sourceLimit) { 701 /* no input, nothing to do */ 702 return; 703 } 704 705 /* write the BOM if necessary */ 706 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 707 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 708 ucnv_fromUWriteBytes(args->converter, 709 bom, 4, 710 &args->target, args->targetLimit, 711 &args->offsets, -1, 712 err); 713 args->converter->fromUnicodeStatus=0; 714 } 715 716 myTarget = (unsigned char *) args->target; 717 temp[3] = 0; 718 719 if (args->converter->fromUChar32) 720 { 721 ch = args->converter->fromUChar32; 722 args->converter->fromUChar32 = 0; 723 goto lowsurogate; 724 } 725 726 while (mySource < sourceLimit && myTarget < targetLimit) 727 { 728 ch = *(mySource++); 729 730 if (U16_IS_SURROGATE(ch)) { 731 if (U16_IS_LEAD(ch)) 732 { 733 lowsurogate: 734 if (mySource < sourceLimit) 735 { 736 ch2 = *mySource; 737 if (U16_IS_TRAIL(ch2)) { 738 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 739 mySource++; 740 } 741 else { 742 /* this is an unmatched trail code unit (2nd surrogate) */ 743 /* callback(illegal) */ 744 args->converter->fromUChar32 = ch; 745 *err = U_ILLEGAL_CHAR_FOUND; 746 break; 747 } 748 } 749 else { 750 /* ran out of source */ 751 args->converter->fromUChar32 = ch; 752 if (args->flush) { 753 /* this is an unmatched trail code unit (2nd surrogate) */ 754 /* callback(illegal) */ 755 *err = U_ILLEGAL_CHAR_FOUND; 756 } 757 break; 758 } 759 } 760 else { 761 /* this is an unmatched trail code unit (2nd surrogate) */ 762 /* callback(illegal) */ 763 args->converter->fromUChar32 = ch; 764 *err = U_ILLEGAL_CHAR_FOUND; 765 break; 766 } 767 } 768 769 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 770 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 771 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 772 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 773 774 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 775 { 776 if (myTarget < targetLimit) 777 { 778 *(myTarget++) = temp[indexToWrite]; 779 } 780 else 781 { 782 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 783 *err = U_BUFFER_OVERFLOW_ERROR; 784 } 785 } 786 } 787 788 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 789 { 790 *err = U_BUFFER_OVERFLOW_ERROR; 791 } 792 793 args->target = (char *) myTarget; 794 args->source = mySource; 795 } 796 797 static void 798 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 799 UErrorCode * err) 800 { 801 const UChar *mySource = args->source; 802 unsigned char *myTarget; 803 int32_t *myOffsets; 804 const UChar *sourceLimit = args->sourceLimit; 805 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 806 UChar32 ch, ch2; 807 unsigned int indexToWrite; 808 unsigned char temp[sizeof(uint32_t)]; 809 int32_t offsetNum = 0; 810 811 if(mySource >= sourceLimit) { 812 /* no input, nothing to do */ 813 return; 814 } 815 816 /* write the BOM if necessary */ 817 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 818 static const char bom[]={ (char)0xff, (char)0xfe, 0, 0 }; 819 ucnv_fromUWriteBytes(args->converter, 820 bom, 4, 821 &args->target, args->targetLimit, 822 &args->offsets, -1, 823 err); 824 args->converter->fromUnicodeStatus=0; 825 } 826 827 myTarget = (unsigned char *) args->target; 828 myOffsets = args->offsets; 829 temp[3] = 0; 830 831 if (args->converter->fromUChar32) 832 { 833 ch = args->converter->fromUChar32; 834 args->converter->fromUChar32 = 0; 835 goto lowsurogate; 836 } 837 838 while (mySource < sourceLimit && myTarget < targetLimit) 839 { 840 ch = *(mySource++); 841 842 if (U16_IS_SURROGATE(ch)) { 843 if (U16_IS_LEAD(ch)) 844 { 845 lowsurogate: 846 if (mySource < sourceLimit) 847 { 848 ch2 = *mySource; 849 if (U16_IS_TRAIL(ch2)) 850 { 851 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 852 mySource++; 853 } 854 else { 855 /* this is an unmatched trail code unit (2nd surrogate) */ 856 /* callback(illegal) */ 857 args->converter->fromUChar32 = ch; 858 *err = U_ILLEGAL_CHAR_FOUND; 859 break; 860 } 861 } 862 else { 863 /* ran out of source */ 864 args->converter->fromUChar32 = ch; 865 if (args->flush) { 866 /* this is an unmatched trail code unit (2nd surrogate) */ 867 /* callback(illegal) */ 868 *err = U_ILLEGAL_CHAR_FOUND; 869 } 870 break; 871 } 872 } 873 else { 874 /* this is an unmatched trail code unit (2nd surrogate) */ 875 /* callback(illegal) */ 876 args->converter->fromUChar32 = ch; 877 *err = U_ILLEGAL_CHAR_FOUND; 878 break; 879 } 880 } 881 882 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 883 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 884 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 885 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 886 887 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 888 { 889 if (myTarget < targetLimit) 890 { 891 *(myTarget++) = temp[indexToWrite]; 892 *(myOffsets++) = offsetNum; 893 } 894 else 895 { 896 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 897 *err = U_BUFFER_OVERFLOW_ERROR; 898 } 899 } 900 offsetNum = offsetNum + 1 + (temp[2] != 0); 901 } 902 903 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 904 { 905 *err = U_BUFFER_OVERFLOW_ERROR; 906 } 907 908 args->target = (char *) myTarget; 909 args->source = mySource; 910 args->offsets = myOffsets; 911 } 912 913 static UChar32 914 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, 915 UErrorCode* err) 916 { 917 const uint8_t *mySource; 918 UChar32 myUChar; 919 int32_t length; 920 921 mySource = (const uint8_t *)args->source; 922 if (mySource >= (const uint8_t *)args->sourceLimit) 923 { 924 /* no input */ 925 *err = U_INDEX_OUTOFBOUNDS_ERROR; 926 return 0xffff; 927 } 928 929 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 930 if (length < 4) 931 { 932 /* got a partial character */ 933 uprv_memcpy(args->converter->toUBytes, mySource, length); 934 args->converter->toULength = (int8_t)length; 935 args->source = (const char *)(mySource + length); 936 *err = U_TRUNCATED_CHAR_FOUND; 937 return 0xffff; 938 } 939 940 /* Don't even try to do a direct cast because the value may be on an odd address. */ 941 myUChar = ((UChar32)mySource[3] << 24) 942 | ((UChar32)mySource[2] << 16) 943 | ((UChar32)mySource[1] << 8) 944 | ((UChar32)mySource[0]); 945 946 args->source = (const char *)(mySource + 4); 947 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 948 return myUChar; 949 } 950 951 uprv_memcpy(args->converter->toUBytes, mySource, 4); 952 args->converter->toULength = 4; 953 954 *err = U_ILLEGAL_CHAR_FOUND; 955 return 0xffff; 956 } 957 958 static const UConverterImpl _UTF32LEImpl = { 959 UCNV_UTF32_LittleEndian, 960 961 NULL, 962 NULL, 963 964 NULL, 965 NULL, 966 NULL, 967 968 T_UConverter_toUnicode_UTF32_LE, 969 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, 970 T_UConverter_fromUnicode_UTF32_LE, 971 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 972 T_UConverter_getNextUChar_UTF32_LE, 973 974 NULL, 975 NULL, 976 NULL, 977 NULL, 978 ucnv_getNonSurrogateUnicodeSet 979 }; 980 981 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 982 static const UConverterStaticData _UTF32LEStaticData = { 983 sizeof(UConverterStaticData), 984 "UTF-32LE", 985 1234, 986 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, 987 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, 988 0, 989 0, 990 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 991 }; 992 993 994 const UConverterSharedData _UTF32LEData = { 995 sizeof(UConverterSharedData), ~((uint32_t) 0), 996 NULL, NULL, &_UTF32LEStaticData, FALSE, &_UTF32LEImpl, 997 0 998 }; 999 1000 /* UTF-32 (Detect BOM) ------------------------------------------------------ */ 1001 1002 /* 1003 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE 1004 * accordingly. 1005 * 1006 * State values: 1007 * 0 initial state 1008 * 1 saw 00 1009 * 2 saw 00 00 1010 * 3 saw 00 00 FE 1011 * 4 - 1012 * 5 saw FF 1013 * 6 saw FF FE 1014 * 7 saw FF FE 00 1015 * 8 UTF-32BE mode 1016 * 9 UTF-32LE mode 1017 * 1018 * During detection: state&3==number of matching bytes so far. 1019 * 1020 * On output, emit U+FEFF as the first code point. 1021 */ 1022 1023 static void 1024 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { 1025 if(choice<=UCNV_RESET_TO_UNICODE) { 1026 /* reset toUnicode: state=0 */ 1027 cnv->mode=0; 1028 } 1029 if(choice!=UCNV_RESET_TO_UNICODE) { 1030 /* reset fromUnicode: prepare to output the UTF-32PE BOM */ 1031 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1032 } 1033 } 1034 1035 static void 1036 _UTF32Open(UConverter *cnv, 1037 UConverterLoadArgs *pArgs, 1038 UErrorCode *pErrorCode) { 1039 _UTF32Reset(cnv, UCNV_RESET_BOTH); 1040 } 1041 1042 static const char utf32BOM[8]={ 0, 0, (char)0xfe, (char)0xff, (char)0xff, (char)0xfe, 0, 0 }; 1043 1044 static void 1045 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1046 UErrorCode *pErrorCode) { 1047 UConverter *cnv=pArgs->converter; 1048 const char *source=pArgs->source; 1049 const char *sourceLimit=pArgs->sourceLimit; 1050 int32_t *offsets=pArgs->offsets; 1051 1052 int32_t state, offsetDelta; 1053 char b; 1054 1055 state=cnv->mode; 1056 1057 /* 1058 * If we detect a BOM in this buffer, then we must add the BOM size to the 1059 * offsets because the actual converter function will not see and count the BOM. 1060 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1061 */ 1062 offsetDelta=0; 1063 1064 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1065 switch(state) { 1066 case 0: 1067 b=*source; 1068 if(b==0) { 1069 state=1; /* could be 00 00 FE FF */ 1070 } else if(b==(char)0xff) { 1071 state=5; /* could be FF FE 00 00 */ 1072 } else { 1073 state=8; /* default to UTF-32BE */ 1074 continue; 1075 } 1076 ++source; 1077 break; 1078 case 1: 1079 case 2: 1080 case 3: 1081 case 5: 1082 case 6: 1083 case 7: 1084 if(*source==utf32BOM[state]) { 1085 ++state; 1086 ++source; 1087 if(state==4) { 1088 state=8; /* detect UTF-32BE */ 1089 offsetDelta=(int32_t)(source-pArgs->source); 1090 } else if(state==8) { 1091 state=9; /* detect UTF-32LE */ 1092 offsetDelta=(int32_t)(source-pArgs->source); 1093 } 1094 } else { 1095 /* switch to UTF-32BE and pass the previous bytes */ 1096 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ 1097 1098 /* reset the source */ 1099 source=pArgs->source; 1100 1101 if(count==(state&3)) { 1102 /* simple: all in the same buffer, just reset source */ 1103 } else { 1104 UBool oldFlush=pArgs->flush; 1105 1106 /* some of the bytes are from a previous buffer, replay those first */ 1107 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1108 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ 1109 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ 1110 1111 /* no offsets: bytes from previous buffer, and not enough for output */ 1112 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1113 1114 /* restore real pointers; pArgs->source will be set in case 8/9 */ 1115 pArgs->sourceLimit=sourceLimit; 1116 pArgs->flush=oldFlush; 1117 } 1118 state=8; 1119 continue; 1120 } 1121 break; 1122 case 8: 1123 /* call UTF-32BE */ 1124 pArgs->source=source; 1125 if(offsets==NULL) { 1126 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1127 } else { 1128 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); 1129 } 1130 source=pArgs->source; 1131 break; 1132 case 9: 1133 /* call UTF-32LE */ 1134 pArgs->source=source; 1135 if(offsets==NULL) { 1136 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1137 } else { 1138 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); 1139 } 1140 source=pArgs->source; 1141 break; 1142 default: 1143 break; /* does not occur */ 1144 } 1145 } 1146 1147 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1148 if(offsets!=NULL && offsetDelta!=0) { 1149 int32_t *offsetsLimit=pArgs->offsets; 1150 while(offsets<offsetsLimit) { 1151 *offsets++ += offsetDelta; 1152 } 1153 } 1154 1155 pArgs->source=source; 1156 1157 if(source==sourceLimit && pArgs->flush) { 1158 /* handle truncated input */ 1159 switch(state) { 1160 case 0: 1161 break; /* no input at all, nothing to do */ 1162 case 8: 1163 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1164 break; 1165 case 9: 1166 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1167 break; 1168 default: 1169 /* handle 0<state<8: call UTF-32BE with too-short input */ 1170 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1171 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ 1172 1173 /* no offsets: not enough for output */ 1174 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1175 pArgs->source=source; 1176 pArgs->sourceLimit=sourceLimit; 1177 state=8; 1178 break; 1179 } 1180 } 1181 1182 cnv->mode=state; 1183 } 1184 1185 static UChar32 1186 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, 1187 UErrorCode *pErrorCode) { 1188 switch(pArgs->converter->mode) { 1189 case 8: 1190 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); 1191 case 9: 1192 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); 1193 default: 1194 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1195 } 1196 } 1197 1198 static const UConverterImpl _UTF32Impl = { 1199 UCNV_UTF32, 1200 1201 NULL, 1202 NULL, 1203 1204 _UTF32Open, 1205 NULL, 1206 _UTF32Reset, 1207 1208 _UTF32ToUnicodeWithOffsets, 1209 _UTF32ToUnicodeWithOffsets, 1210 #if U_IS_BIG_ENDIAN 1211 T_UConverter_fromUnicode_UTF32_BE, 1212 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 1213 #else 1214 T_UConverter_fromUnicode_UTF32_LE, 1215 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 1216 #endif 1217 _UTF32GetNextUChar, 1218 1219 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1220 NULL, 1221 NULL, 1222 NULL, 1223 ucnv_getNonSurrogateUnicodeSet 1224 }; 1225 1226 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */ 1227 static const UConverterStaticData _UTF32StaticData = { 1228 sizeof(UConverterStaticData), 1229 "UTF-32", 1230 1236, 1231 UCNV_IBM, UCNV_UTF32, 4, 4, 1232 #if U_IS_BIG_ENDIAN 1233 { 0, 0, 0xff, 0xfd }, 4, 1234 #else 1235 { 0xfd, 0xff, 0, 0 }, 4, 1236 #endif 1237 FALSE, FALSE, 1238 0, 1239 0, 1240 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1241 }; 1242 1243 const UConverterSharedData _UTF32Data = { 1244 sizeof(UConverterSharedData), ~((uint32_t) 0), 1245 NULL, NULL, &_UTF32StaticData, FALSE, &_UTF32Impl, 1246 0 1247 }; 1248 1249 #endif 1250