1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv_u32.c 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jul01 14 * created by: Markus W. Scherer 15 * 16 * UTF-32 converter implementation. Used to be in ucnv_utf.c. 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION 22 23 #include "unicode/ucnv.h" 24 #include "unicode/utf.h" 25 #include "ucnv_bld.h" 26 #include "ucnv_cnv.h" 27 #include "cmemory.h" 28 29 #define MAXIMUM_UCS2 0x0000FFFF 30 #define MAXIMUM_UTF 0x0010FFFF 31 #define HALF_SHIFT 10 32 #define HALF_BASE 0x0010000 33 #define HALF_MASK 0x3FF 34 #define SURROGATE_HIGH_START 0xD800 35 #define SURROGATE_LOW_START 0xDC00 36 37 /* -SURROGATE_LOW_START + HALF_BASE */ 38 #define SURROGATE_LOW_BASE 9216 39 40 enum { 41 UCNV_NEED_TO_WRITE_BOM=1 42 }; 43 44 /* UTF-32BE ----------------------------------------------------------------- */ 45 U_CDECL_BEGIN 46 static void U_CALLCONV 47 T_UConverter_toUnicode_UTF32_BE(UConverterToUnicodeArgs * args, 48 UErrorCode * err) 49 { 50 const unsigned char *mySource = (unsigned char *) args->source; 51 UChar *myTarget = args->target; 52 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 53 const UChar *targetLimit = args->targetLimit; 54 unsigned char *toUBytes = args->converter->toUBytes; 55 uint32_t ch, i; 56 57 /* Restore state of current sequence */ 58 if (args->converter->toULength > 0 && myTarget < targetLimit) { 59 i = args->converter->toULength; /* restore # of bytes consumed */ 60 args->converter->toULength = 0; 61 62 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 63 args->converter->toUnicodeStatus = 0; 64 goto morebytes; 65 } 66 67 while (mySource < sourceLimit && myTarget < targetLimit) { 68 i = 0; 69 ch = 0; 70 morebytes: 71 while (i < sizeof(uint32_t)) { 72 if (mySource < sourceLimit) { 73 ch = (ch << 8) | (uint8_t)(*mySource); 74 toUBytes[i++] = (char) *(mySource++); 75 } 76 else { 77 /* stores a partially calculated target*/ 78 /* + 1 to make 0 a valid character */ 79 args->converter->toUnicodeStatus = ch + 1; 80 args->converter->toULength = (int8_t) i; 81 goto donefornow; 82 } 83 } 84 85 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 86 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 87 if (ch <= MAXIMUM_UCS2) 88 { 89 /* fits in 16 bits */ 90 *(myTarget++) = (UChar) ch; 91 } 92 else { 93 /* write out the surrogates */ 94 *(myTarget++) = U16_LEAD(ch); 95 ch = U16_TRAIL(ch); 96 if (myTarget < targetLimit) { 97 *(myTarget++) = (UChar)ch; 98 } 99 else { 100 /* Put in overflow buffer (not handled here) */ 101 args->converter->UCharErrorBuffer[0] = (UChar) ch; 102 args->converter->UCharErrorBufferLength = 1; 103 *err = U_BUFFER_OVERFLOW_ERROR; 104 break; 105 } 106 } 107 } 108 else { 109 args->converter->toULength = (int8_t)i; 110 *err = U_ILLEGAL_CHAR_FOUND; 111 break; 112 } 113 } 114 115 donefornow: 116 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 117 /* End of target buffer */ 118 *err = U_BUFFER_OVERFLOW_ERROR; 119 } 120 121 args->target = myTarget; 122 args->source = (const char *) mySource; 123 } 124 125 static void U_CALLCONV 126 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 127 UErrorCode * err) 128 { 129 const unsigned char *mySource = (unsigned char *) args->source; 130 UChar *myTarget = args->target; 131 int32_t *myOffsets = args->offsets; 132 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 133 const UChar *targetLimit = args->targetLimit; 134 unsigned char *toUBytes = args->converter->toUBytes; 135 uint32_t ch, i; 136 int32_t offsetNum = 0; 137 138 /* Restore state of current sequence */ 139 if (args->converter->toULength > 0 && myTarget < targetLimit) { 140 i = args->converter->toULength; /* restore # of bytes consumed */ 141 args->converter->toULength = 0; 142 143 ch = args->converter->toUnicodeStatus - 1;/*Stores the previously calculated ch from a previous call*/ 144 args->converter->toUnicodeStatus = 0; 145 goto morebytes; 146 } 147 148 while (mySource < sourceLimit && myTarget < targetLimit) { 149 i = 0; 150 ch = 0; 151 morebytes: 152 while (i < sizeof(uint32_t)) { 153 if (mySource < sourceLimit) { 154 ch = (ch << 8) | (uint8_t)(*mySource); 155 toUBytes[i++] = (char) *(mySource++); 156 } 157 else { 158 /* stores a partially calculated target*/ 159 /* + 1 to make 0 a valid character */ 160 args->converter->toUnicodeStatus = ch + 1; 161 args->converter->toULength = (int8_t) i; 162 goto donefornow; 163 } 164 } 165 166 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 167 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 168 if (ch <= MAXIMUM_UCS2) { 169 /* fits in 16 bits */ 170 *(myTarget++) = (UChar) ch; 171 *(myOffsets++) = offsetNum; 172 } 173 else { 174 /* write out the surrogates */ 175 *(myTarget++) = U16_LEAD(ch); 176 *myOffsets++ = offsetNum; 177 ch = U16_TRAIL(ch); 178 if (myTarget < targetLimit) 179 { 180 *(myTarget++) = (UChar)ch; 181 *(myOffsets++) = offsetNum; 182 } 183 else { 184 /* Put in overflow buffer (not handled here) */ 185 args->converter->UCharErrorBuffer[0] = (UChar) ch; 186 args->converter->UCharErrorBufferLength = 1; 187 *err = U_BUFFER_OVERFLOW_ERROR; 188 break; 189 } 190 } 191 } 192 else { 193 args->converter->toULength = (int8_t)i; 194 *err = U_ILLEGAL_CHAR_FOUND; 195 break; 196 } 197 offsetNum += i; 198 } 199 200 donefornow: 201 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 202 { 203 /* End of target buffer */ 204 *err = U_BUFFER_OVERFLOW_ERROR; 205 } 206 207 args->target = myTarget; 208 args->source = (const char *) mySource; 209 args->offsets = myOffsets; 210 } 211 212 static void U_CALLCONV 213 T_UConverter_fromUnicode_UTF32_BE(UConverterFromUnicodeArgs * args, 214 UErrorCode * err) 215 { 216 const UChar *mySource = args->source; 217 unsigned char *myTarget; 218 const UChar *sourceLimit = args->sourceLimit; 219 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 220 UChar32 ch, ch2; 221 unsigned int indexToWrite; 222 unsigned char temp[sizeof(uint32_t)]; 223 224 if(mySource >= sourceLimit) { 225 /* no input, nothing to do */ 226 return; 227 } 228 229 /* write the BOM if necessary */ 230 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 231 static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu }; 232 ucnv_fromUWriteBytes(args->converter, 233 bom, 4, 234 &args->target, args->targetLimit, 235 &args->offsets, -1, 236 err); 237 args->converter->fromUnicodeStatus=0; 238 } 239 240 myTarget = (unsigned char *) args->target; 241 temp[0] = 0; 242 243 if (args->converter->fromUChar32) { 244 ch = args->converter->fromUChar32; 245 args->converter->fromUChar32 = 0; 246 goto lowsurogate; 247 } 248 249 while (mySource < sourceLimit && myTarget < targetLimit) { 250 ch = *(mySource++); 251 252 if (U_IS_SURROGATE(ch)) { 253 if (U_IS_LEAD(ch)) { 254 lowsurogate: 255 if (mySource < sourceLimit) { 256 ch2 = *mySource; 257 if (U_IS_TRAIL(ch2)) { 258 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 259 mySource++; 260 } 261 else { 262 /* this is an unmatched trail code unit (2nd surrogate) */ 263 /* callback(illegal) */ 264 args->converter->fromUChar32 = ch; 265 *err = U_ILLEGAL_CHAR_FOUND; 266 break; 267 } 268 } 269 else { 270 /* ran out of source */ 271 args->converter->fromUChar32 = ch; 272 if (args->flush) { 273 /* this is an unmatched trail code unit (2nd surrogate) */ 274 /* callback(illegal) */ 275 *err = U_ILLEGAL_CHAR_FOUND; 276 } 277 break; 278 } 279 } 280 else { 281 /* this is an unmatched trail code unit (2nd surrogate) */ 282 /* callback(illegal) */ 283 args->converter->fromUChar32 = ch; 284 *err = U_ILLEGAL_CHAR_FOUND; 285 break; 286 } 287 } 288 289 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 290 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 291 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 292 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 293 294 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 295 if (myTarget < targetLimit) { 296 *(myTarget++) = temp[indexToWrite]; 297 } 298 else { 299 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 300 *err = U_BUFFER_OVERFLOW_ERROR; 301 } 302 } 303 } 304 305 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 306 *err = U_BUFFER_OVERFLOW_ERROR; 307 } 308 309 args->target = (char *) myTarget; 310 args->source = mySource; 311 } 312 313 static void U_CALLCONV 314 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 315 UErrorCode * err) 316 { 317 const UChar *mySource = args->source; 318 unsigned char *myTarget; 319 int32_t *myOffsets; 320 const UChar *sourceLimit = args->sourceLimit; 321 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 322 UChar32 ch, ch2; 323 int32_t offsetNum = 0; 324 unsigned int indexToWrite; 325 unsigned char temp[sizeof(uint32_t)]; 326 327 if(mySource >= sourceLimit) { 328 /* no input, nothing to do */ 329 return; 330 } 331 332 /* write the BOM if necessary */ 333 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 334 static const char bom[]={ 0, 0, (char)0xfeu, (char)0xffu }; 335 ucnv_fromUWriteBytes(args->converter, 336 bom, 4, 337 &args->target, args->targetLimit, 338 &args->offsets, -1, 339 err); 340 args->converter->fromUnicodeStatus=0; 341 } 342 343 myTarget = (unsigned char *) args->target; 344 myOffsets = args->offsets; 345 temp[0] = 0; 346 347 if (args->converter->fromUChar32) { 348 ch = args->converter->fromUChar32; 349 args->converter->fromUChar32 = 0; 350 goto lowsurogate; 351 } 352 353 while (mySource < sourceLimit && myTarget < targetLimit) { 354 ch = *(mySource++); 355 356 if (U_IS_SURROGATE(ch)) { 357 if (U_IS_LEAD(ch)) { 358 lowsurogate: 359 if (mySource < sourceLimit) { 360 ch2 = *mySource; 361 if (U_IS_TRAIL(ch2)) { 362 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 363 mySource++; 364 } 365 else { 366 /* this is an unmatched trail code unit (2nd surrogate) */ 367 /* callback(illegal) */ 368 args->converter->fromUChar32 = ch; 369 *err = U_ILLEGAL_CHAR_FOUND; 370 break; 371 } 372 } 373 else { 374 /* ran out of source */ 375 args->converter->fromUChar32 = ch; 376 if (args->flush) { 377 /* this is an unmatched trail code unit (2nd surrogate) */ 378 /* callback(illegal) */ 379 *err = U_ILLEGAL_CHAR_FOUND; 380 } 381 break; 382 } 383 } 384 else { 385 /* this is an unmatched trail code unit (2nd surrogate) */ 386 /* callback(illegal) */ 387 args->converter->fromUChar32 = ch; 388 *err = U_ILLEGAL_CHAR_FOUND; 389 break; 390 } 391 } 392 393 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 394 temp[1] = (uint8_t) (ch >> 16 & 0x1F); 395 temp[2] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 396 temp[3] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 397 398 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) { 399 if (myTarget < targetLimit) { 400 *(myTarget++) = temp[indexToWrite]; 401 *(myOffsets++) = offsetNum; 402 } 403 else { 404 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 405 *err = U_BUFFER_OVERFLOW_ERROR; 406 } 407 } 408 offsetNum = offsetNum + 1 + (temp[1] != 0); 409 } 410 411 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) { 412 *err = U_BUFFER_OVERFLOW_ERROR; 413 } 414 415 args->target = (char *) myTarget; 416 args->source = mySource; 417 args->offsets = myOffsets; 418 } 419 420 static UChar32 U_CALLCONV 421 T_UConverter_getNextUChar_UTF32_BE(UConverterToUnicodeArgs* args, 422 UErrorCode* err) 423 { 424 const uint8_t *mySource; 425 UChar32 myUChar; 426 int32_t length; 427 428 mySource = (const uint8_t *)args->source; 429 if (mySource >= (const uint8_t *)args->sourceLimit) 430 { 431 /* no input */ 432 *err = U_INDEX_OUTOFBOUNDS_ERROR; 433 return 0xffff; 434 } 435 436 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 437 if (length < 4) 438 { 439 /* got a partial character */ 440 uprv_memcpy(args->converter->toUBytes, mySource, length); 441 args->converter->toULength = (int8_t)length; 442 args->source = (const char *)(mySource + length); 443 *err = U_TRUNCATED_CHAR_FOUND; 444 return 0xffff; 445 } 446 447 /* Don't even try to do a direct cast because the value may be on an odd address. */ 448 myUChar = ((UChar32)mySource[0] << 24) 449 | ((UChar32)mySource[1] << 16) 450 | ((UChar32)mySource[2] << 8) 451 | ((UChar32)mySource[3]); 452 453 args->source = (const char *)(mySource + 4); 454 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 455 return myUChar; 456 } 457 458 uprv_memcpy(args->converter->toUBytes, mySource, 4); 459 args->converter->toULength = 4; 460 461 *err = U_ILLEGAL_CHAR_FOUND; 462 return 0xffff; 463 } 464 U_CDECL_END 465 static const UConverterImpl _UTF32BEImpl = { 466 UCNV_UTF32_BigEndian, 467 468 NULL, 469 NULL, 470 471 NULL, 472 NULL, 473 NULL, 474 475 T_UConverter_toUnicode_UTF32_BE, 476 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC, 477 T_UConverter_fromUnicode_UTF32_BE, 478 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 479 T_UConverter_getNextUChar_UTF32_BE, 480 481 NULL, 482 NULL, 483 NULL, 484 NULL, 485 ucnv_getNonSurrogateUnicodeSet, 486 487 NULL, 488 NULL 489 }; 490 491 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 492 static const UConverterStaticData _UTF32BEStaticData = { 493 sizeof(UConverterStaticData), 494 "UTF-32BE", 495 1232, 496 UCNV_IBM, UCNV_UTF32_BigEndian, 4, 4, 497 { 0, 0, 0xff, 0xfd }, 4, FALSE, FALSE, 498 0, 499 0, 500 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 501 }; 502 503 const UConverterSharedData _UTF32BEData = 504 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32BEStaticData, &_UTF32BEImpl); 505 506 /* UTF-32LE ---------------------------------------------------------- */ 507 U_CDECL_BEGIN 508 static void U_CALLCONV 509 T_UConverter_toUnicode_UTF32_LE(UConverterToUnicodeArgs * args, 510 UErrorCode * err) 511 { 512 const unsigned char *mySource = (unsigned char *) args->source; 513 UChar *myTarget = args->target; 514 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 515 const UChar *targetLimit = args->targetLimit; 516 unsigned char *toUBytes = args->converter->toUBytes; 517 uint32_t ch, i; 518 519 /* Restore state of current sequence */ 520 if (args->converter->toULength > 0 && myTarget < targetLimit) 521 { 522 i = args->converter->toULength; /* restore # of bytes consumed */ 523 args->converter->toULength = 0; 524 525 /* Stores the previously calculated ch from a previous call*/ 526 ch = args->converter->toUnicodeStatus - 1; 527 args->converter->toUnicodeStatus = 0; 528 goto morebytes; 529 } 530 531 while (mySource < sourceLimit && myTarget < targetLimit) 532 { 533 i = 0; 534 ch = 0; 535 morebytes: 536 while (i < sizeof(uint32_t)) 537 { 538 if (mySource < sourceLimit) 539 { 540 ch |= ((uint8_t)(*mySource)) << (i * 8); 541 toUBytes[i++] = (char) *(mySource++); 542 } 543 else 544 { 545 /* stores a partially calculated target*/ 546 /* + 1 to make 0 a valid character */ 547 args->converter->toUnicodeStatus = ch + 1; 548 args->converter->toULength = (int8_t) i; 549 goto donefornow; 550 } 551 } 552 553 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) { 554 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 555 if (ch <= MAXIMUM_UCS2) { 556 /* fits in 16 bits */ 557 *(myTarget++) = (UChar) ch; 558 } 559 else { 560 /* write out the surrogates */ 561 *(myTarget++) = U16_LEAD(ch); 562 ch = U16_TRAIL(ch); 563 if (myTarget < targetLimit) { 564 *(myTarget++) = (UChar)ch; 565 } 566 else { 567 /* Put in overflow buffer (not handled here) */ 568 args->converter->UCharErrorBuffer[0] = (UChar) ch; 569 args->converter->UCharErrorBufferLength = 1; 570 *err = U_BUFFER_OVERFLOW_ERROR; 571 break; 572 } 573 } 574 } 575 else { 576 args->converter->toULength = (int8_t)i; 577 *err = U_ILLEGAL_CHAR_FOUND; 578 break; 579 } 580 } 581 582 donefornow: 583 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 584 { 585 /* End of target buffer */ 586 *err = U_BUFFER_OVERFLOW_ERROR; 587 } 588 589 args->target = myTarget; 590 args->source = (const char *) mySource; 591 } 592 593 static void U_CALLCONV 594 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(UConverterToUnicodeArgs * args, 595 UErrorCode * err) 596 { 597 const unsigned char *mySource = (unsigned char *) args->source; 598 UChar *myTarget = args->target; 599 int32_t *myOffsets = args->offsets; 600 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 601 const UChar *targetLimit = args->targetLimit; 602 unsigned char *toUBytes = args->converter->toUBytes; 603 uint32_t ch, i; 604 int32_t offsetNum = 0; 605 606 /* Restore state of current sequence */ 607 if (args->converter->toULength > 0 && myTarget < targetLimit) 608 { 609 i = args->converter->toULength; /* restore # of bytes consumed */ 610 args->converter->toULength = 0; 611 612 /* Stores the previously calculated ch from a previous call*/ 613 ch = args->converter->toUnicodeStatus - 1; 614 args->converter->toUnicodeStatus = 0; 615 goto morebytes; 616 } 617 618 while (mySource < sourceLimit && myTarget < targetLimit) 619 { 620 i = 0; 621 ch = 0; 622 morebytes: 623 while (i < sizeof(uint32_t)) 624 { 625 if (mySource < sourceLimit) 626 { 627 ch |= ((uint8_t)(*mySource)) << (i * 8); 628 toUBytes[i++] = (char) *(mySource++); 629 } 630 else 631 { 632 /* stores a partially calculated target*/ 633 /* + 1 to make 0 a valid character */ 634 args->converter->toUnicodeStatus = ch + 1; 635 args->converter->toULength = (int8_t) i; 636 goto donefornow; 637 } 638 } 639 640 if (ch <= MAXIMUM_UTF && !U_IS_SURROGATE(ch)) 641 { 642 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 643 if (ch <= MAXIMUM_UCS2) 644 { 645 /* fits in 16 bits */ 646 *(myTarget++) = (UChar) ch; 647 *(myOffsets++) = offsetNum; 648 } 649 else { 650 /* write out the surrogates */ 651 *(myTarget++) = U16_LEAD(ch); 652 *(myOffsets++) = offsetNum; 653 ch = U16_TRAIL(ch); 654 if (myTarget < targetLimit) 655 { 656 *(myTarget++) = (UChar)ch; 657 *(myOffsets++) = offsetNum; 658 } 659 else 660 { 661 /* Put in overflow buffer (not handled here) */ 662 args->converter->UCharErrorBuffer[0] = (UChar) ch; 663 args->converter->UCharErrorBufferLength = 1; 664 *err = U_BUFFER_OVERFLOW_ERROR; 665 break; 666 } 667 } 668 } 669 else 670 { 671 args->converter->toULength = (int8_t)i; 672 *err = U_ILLEGAL_CHAR_FOUND; 673 break; 674 } 675 offsetNum += i; 676 } 677 678 donefornow: 679 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 680 { 681 /* End of target buffer */ 682 *err = U_BUFFER_OVERFLOW_ERROR; 683 } 684 685 args->target = myTarget; 686 args->source = (const char *) mySource; 687 args->offsets = myOffsets; 688 } 689 690 static void U_CALLCONV 691 T_UConverter_fromUnicode_UTF32_LE(UConverterFromUnicodeArgs * args, 692 UErrorCode * err) 693 { 694 const UChar *mySource = args->source; 695 unsigned char *myTarget; 696 const UChar *sourceLimit = args->sourceLimit; 697 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 698 UChar32 ch, ch2; 699 unsigned int indexToWrite; 700 unsigned char temp[sizeof(uint32_t)]; 701 702 if(mySource >= sourceLimit) { 703 /* no input, nothing to do */ 704 return; 705 } 706 707 /* write the BOM if necessary */ 708 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 709 static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 }; 710 ucnv_fromUWriteBytes(args->converter, 711 bom, 4, 712 &args->target, args->targetLimit, 713 &args->offsets, -1, 714 err); 715 args->converter->fromUnicodeStatus=0; 716 } 717 718 myTarget = (unsigned char *) args->target; 719 temp[3] = 0; 720 721 if (args->converter->fromUChar32) 722 { 723 ch = args->converter->fromUChar32; 724 args->converter->fromUChar32 = 0; 725 goto lowsurogate; 726 } 727 728 while (mySource < sourceLimit && myTarget < targetLimit) 729 { 730 ch = *(mySource++); 731 732 if (U16_IS_SURROGATE(ch)) { 733 if (U16_IS_LEAD(ch)) 734 { 735 lowsurogate: 736 if (mySource < sourceLimit) 737 { 738 ch2 = *mySource; 739 if (U16_IS_TRAIL(ch2)) { 740 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 741 mySource++; 742 } 743 else { 744 /* this is an unmatched trail code unit (2nd surrogate) */ 745 /* callback(illegal) */ 746 args->converter->fromUChar32 = ch; 747 *err = U_ILLEGAL_CHAR_FOUND; 748 break; 749 } 750 } 751 else { 752 /* ran out of source */ 753 args->converter->fromUChar32 = ch; 754 if (args->flush) { 755 /* this is an unmatched trail code unit (2nd surrogate) */ 756 /* callback(illegal) */ 757 *err = U_ILLEGAL_CHAR_FOUND; 758 } 759 break; 760 } 761 } 762 else { 763 /* this is an unmatched trail code unit (2nd surrogate) */ 764 /* callback(illegal) */ 765 args->converter->fromUChar32 = ch; 766 *err = U_ILLEGAL_CHAR_FOUND; 767 break; 768 } 769 } 770 771 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 772 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 773 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 774 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 775 776 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 777 { 778 if (myTarget < targetLimit) 779 { 780 *(myTarget++) = temp[indexToWrite]; 781 } 782 else 783 { 784 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 785 *err = U_BUFFER_OVERFLOW_ERROR; 786 } 787 } 788 } 789 790 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 791 { 792 *err = U_BUFFER_OVERFLOW_ERROR; 793 } 794 795 args->target = (char *) myTarget; 796 args->source = mySource; 797 } 798 799 static void U_CALLCONV 800 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC(UConverterFromUnicodeArgs * args, 801 UErrorCode * err) 802 { 803 const UChar *mySource = args->source; 804 unsigned char *myTarget; 805 int32_t *myOffsets; 806 const UChar *sourceLimit = args->sourceLimit; 807 const unsigned char *targetLimit = (unsigned char *) args->targetLimit; 808 UChar32 ch, ch2; 809 unsigned int indexToWrite; 810 unsigned char temp[sizeof(uint32_t)]; 811 int32_t offsetNum = 0; 812 813 if(mySource >= sourceLimit) { 814 /* no input, nothing to do */ 815 return; 816 } 817 818 /* write the BOM if necessary */ 819 if(args->converter->fromUnicodeStatus==UCNV_NEED_TO_WRITE_BOM) { 820 static const char bom[]={ (char)0xffu, (char)0xfeu, 0, 0 }; 821 ucnv_fromUWriteBytes(args->converter, 822 bom, 4, 823 &args->target, args->targetLimit, 824 &args->offsets, -1, 825 err); 826 args->converter->fromUnicodeStatus=0; 827 } 828 829 myTarget = (unsigned char *) args->target; 830 myOffsets = args->offsets; 831 temp[3] = 0; 832 833 if (args->converter->fromUChar32) 834 { 835 ch = args->converter->fromUChar32; 836 args->converter->fromUChar32 = 0; 837 goto lowsurogate; 838 } 839 840 while (mySource < sourceLimit && myTarget < targetLimit) 841 { 842 ch = *(mySource++); 843 844 if (U16_IS_SURROGATE(ch)) { 845 if (U16_IS_LEAD(ch)) 846 { 847 lowsurogate: 848 if (mySource < sourceLimit) 849 { 850 ch2 = *mySource; 851 if (U16_IS_TRAIL(ch2)) 852 { 853 ch = ((ch - SURROGATE_HIGH_START) << HALF_SHIFT) + ch2 + SURROGATE_LOW_BASE; 854 mySource++; 855 } 856 else { 857 /* this is an unmatched trail code unit (2nd surrogate) */ 858 /* callback(illegal) */ 859 args->converter->fromUChar32 = ch; 860 *err = U_ILLEGAL_CHAR_FOUND; 861 break; 862 } 863 } 864 else { 865 /* ran out of source */ 866 args->converter->fromUChar32 = ch; 867 if (args->flush) { 868 /* this is an unmatched trail code unit (2nd surrogate) */ 869 /* callback(illegal) */ 870 *err = U_ILLEGAL_CHAR_FOUND; 871 } 872 break; 873 } 874 } 875 else { 876 /* this is an unmatched trail code unit (2nd surrogate) */ 877 /* callback(illegal) */ 878 args->converter->fromUChar32 = ch; 879 *err = U_ILLEGAL_CHAR_FOUND; 880 break; 881 } 882 } 883 884 /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */ 885 temp[2] = (uint8_t) (ch >> 16 & 0x1F); 886 temp[1] = (uint8_t) (ch >> 8); /* unsigned cast implicitly does (ch & FF) */ 887 temp[0] = (uint8_t) (ch); /* unsigned cast implicitly does (ch & FF) */ 888 889 for (indexToWrite = 0; indexToWrite <= sizeof(uint32_t) - 1; indexToWrite++) 890 { 891 if (myTarget < targetLimit) 892 { 893 *(myTarget++) = temp[indexToWrite]; 894 *(myOffsets++) = offsetNum; 895 } 896 else 897 { 898 args->converter->charErrorBuffer[args->converter->charErrorBufferLength++] = temp[indexToWrite]; 899 *err = U_BUFFER_OVERFLOW_ERROR; 900 } 901 } 902 offsetNum = offsetNum + 1 + (temp[2] != 0); 903 } 904 905 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 906 { 907 *err = U_BUFFER_OVERFLOW_ERROR; 908 } 909 910 args->target = (char *) myTarget; 911 args->source = mySource; 912 args->offsets = myOffsets; 913 } 914 915 static UChar32 U_CALLCONV 916 T_UConverter_getNextUChar_UTF32_LE(UConverterToUnicodeArgs* args, 917 UErrorCode* err) 918 { 919 const uint8_t *mySource; 920 UChar32 myUChar; 921 int32_t length; 922 923 mySource = (const uint8_t *)args->source; 924 if (mySource >= (const uint8_t *)args->sourceLimit) 925 { 926 /* no input */ 927 *err = U_INDEX_OUTOFBOUNDS_ERROR; 928 return 0xffff; 929 } 930 931 length = (int32_t)((const uint8_t *)args->sourceLimit - mySource); 932 if (length < 4) 933 { 934 /* got a partial character */ 935 uprv_memcpy(args->converter->toUBytes, mySource, length); 936 args->converter->toULength = (int8_t)length; 937 args->source = (const char *)(mySource + length); 938 *err = U_TRUNCATED_CHAR_FOUND; 939 return 0xffff; 940 } 941 942 /* Don't even try to do a direct cast because the value may be on an odd address. */ 943 myUChar = ((UChar32)mySource[3] << 24) 944 | ((UChar32)mySource[2] << 16) 945 | ((UChar32)mySource[1] << 8) 946 | ((UChar32)mySource[0]); 947 948 args->source = (const char *)(mySource + 4); 949 if ((uint32_t)myUChar <= MAXIMUM_UTF && !U_IS_SURROGATE(myUChar)) { 950 return myUChar; 951 } 952 953 uprv_memcpy(args->converter->toUBytes, mySource, 4); 954 args->converter->toULength = 4; 955 956 *err = U_ILLEGAL_CHAR_FOUND; 957 return 0xffff; 958 } 959 U_CDECL_END 960 static const UConverterImpl _UTF32LEImpl = { 961 UCNV_UTF32_LittleEndian, 962 963 NULL, 964 NULL, 965 966 NULL, 967 NULL, 968 NULL, 969 970 T_UConverter_toUnicode_UTF32_LE, 971 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC, 972 T_UConverter_fromUnicode_UTF32_LE, 973 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 974 T_UConverter_getNextUChar_UTF32_LE, 975 976 NULL, 977 NULL, 978 NULL, 979 NULL, 980 ucnv_getNonSurrogateUnicodeSet, 981 982 NULL, 983 NULL 984 }; 985 986 /* The 1232 CCSID refers to any version of Unicode with any endianess of UTF-32 */ 987 static const UConverterStaticData _UTF32LEStaticData = { 988 sizeof(UConverterStaticData), 989 "UTF-32LE", 990 1234, 991 UCNV_IBM, UCNV_UTF32_LittleEndian, 4, 4, 992 { 0xfd, 0xff, 0, 0 }, 4, FALSE, FALSE, 993 0, 994 0, 995 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 996 }; 997 998 999 const UConverterSharedData _UTF32LEData = 1000 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32LEStaticData, &_UTF32LEImpl); 1001 1002 /* UTF-32 (Detect BOM) ------------------------------------------------------ */ 1003 1004 /* 1005 * Detect a BOM at the beginning of the stream and select UTF-32BE or UTF-32LE 1006 * accordingly. 1007 * 1008 * State values: 1009 * 0 initial state 1010 * 1 saw 00 1011 * 2 saw 00 00 1012 * 3 saw 00 00 FE 1013 * 4 - 1014 * 5 saw FF 1015 * 6 saw FF FE 1016 * 7 saw FF FE 00 1017 * 8 UTF-32BE mode 1018 * 9 UTF-32LE mode 1019 * 1020 * During detection: state&3==number of matching bytes so far. 1021 * 1022 * On output, emit U+FEFF as the first code point. 1023 */ 1024 U_CDECL_BEGIN 1025 static void U_CALLCONV 1026 _UTF32Reset(UConverter *cnv, UConverterResetChoice choice) { 1027 if(choice<=UCNV_RESET_TO_UNICODE) { 1028 /* reset toUnicode: state=0 */ 1029 cnv->mode=0; 1030 } 1031 if(choice!=UCNV_RESET_TO_UNICODE) { 1032 /* reset fromUnicode: prepare to output the UTF-32PE BOM */ 1033 cnv->fromUnicodeStatus=UCNV_NEED_TO_WRITE_BOM; 1034 } 1035 } 1036 1037 static void U_CALLCONV 1038 _UTF32Open(UConverter *cnv, 1039 UConverterLoadArgs *pArgs, 1040 UErrorCode *pErrorCode) { 1041 (void)pArgs; 1042 (void)pErrorCode; 1043 _UTF32Reset(cnv, UCNV_RESET_BOTH); 1044 } 1045 1046 static const char utf32BOM[8]={ 0, 0, (char)0xfeu, (char)0xffu, (char)0xffu, (char)0xfeu, 0, 0 }; 1047 1048 static void U_CALLCONV 1049 _UTF32ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, 1050 UErrorCode *pErrorCode) { 1051 UConverter *cnv=pArgs->converter; 1052 const char *source=pArgs->source; 1053 const char *sourceLimit=pArgs->sourceLimit; 1054 int32_t *offsets=pArgs->offsets; 1055 1056 int32_t state, offsetDelta; 1057 char b; 1058 1059 state=cnv->mode; 1060 1061 /* 1062 * If we detect a BOM in this buffer, then we must add the BOM size to the 1063 * offsets because the actual converter function will not see and count the BOM. 1064 * offsetDelta will have the number of the BOM bytes that are in the current buffer. 1065 */ 1066 offsetDelta=0; 1067 1068 while(source<sourceLimit && U_SUCCESS(*pErrorCode)) { 1069 switch(state) { 1070 case 0: 1071 b=*source; 1072 if(b==0) { 1073 state=1; /* could be 00 00 FE FF */ 1074 } else if(b==(char)0xffu) { 1075 state=5; /* could be FF FE 00 00 */ 1076 } else { 1077 state=8; /* default to UTF-32BE */ 1078 continue; 1079 } 1080 ++source; 1081 break; 1082 case 1: 1083 case 2: 1084 case 3: 1085 case 5: 1086 case 6: 1087 case 7: 1088 if(*source==utf32BOM[state]) { 1089 ++state; 1090 ++source; 1091 if(state==4) { 1092 state=8; /* detect UTF-32BE */ 1093 offsetDelta=(int32_t)(source-pArgs->source); 1094 } else if(state==8) { 1095 state=9; /* detect UTF-32LE */ 1096 offsetDelta=(int32_t)(source-pArgs->source); 1097 } 1098 } else { 1099 /* switch to UTF-32BE and pass the previous bytes */ 1100 int32_t count=(int32_t)(source-pArgs->source); /* number of bytes from this buffer */ 1101 1102 /* reset the source */ 1103 source=pArgs->source; 1104 1105 if(count==(state&3)) { 1106 /* simple: all in the same buffer, just reset source */ 1107 } else { 1108 UBool oldFlush=pArgs->flush; 1109 1110 /* some of the bytes are from a previous buffer, replay those first */ 1111 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1112 pArgs->sourceLimit=pArgs->source+((state&3)-count); /* replay previous bytes */ 1113 pArgs->flush=FALSE; /* this sourceLimit is not the real source stream limit */ 1114 1115 /* no offsets: bytes from previous buffer, and not enough for output */ 1116 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1117 1118 /* restore real pointers; pArgs->source will be set in case 8/9 */ 1119 pArgs->sourceLimit=sourceLimit; 1120 pArgs->flush=oldFlush; 1121 } 1122 state=8; 1123 continue; 1124 } 1125 break; 1126 case 8: 1127 /* call UTF-32BE */ 1128 pArgs->source=source; 1129 if(offsets==NULL) { 1130 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1131 } else { 1132 T_UConverter_toUnicode_UTF32_BE_OFFSET_LOGIC(pArgs, pErrorCode); 1133 } 1134 source=pArgs->source; 1135 break; 1136 case 9: 1137 /* call UTF-32LE */ 1138 pArgs->source=source; 1139 if(offsets==NULL) { 1140 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1141 } else { 1142 T_UConverter_toUnicode_UTF32_LE_OFFSET_LOGIC(pArgs, pErrorCode); 1143 } 1144 source=pArgs->source; 1145 break; 1146 default: 1147 break; /* does not occur */ 1148 } 1149 } 1150 1151 /* add BOM size to offsets - see comment at offsetDelta declaration */ 1152 if(offsets!=NULL && offsetDelta!=0) { 1153 int32_t *offsetsLimit=pArgs->offsets; 1154 while(offsets<offsetsLimit) { 1155 *offsets++ += offsetDelta; 1156 } 1157 } 1158 1159 pArgs->source=source; 1160 1161 if(source==sourceLimit && pArgs->flush) { 1162 /* handle truncated input */ 1163 switch(state) { 1164 case 0: 1165 break; /* no input at all, nothing to do */ 1166 case 8: 1167 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1168 break; 1169 case 9: 1170 T_UConverter_toUnicode_UTF32_LE(pArgs, pErrorCode); 1171 break; 1172 default: 1173 /* handle 0<state<8: call UTF-32BE with too-short input */ 1174 pArgs->source=utf32BOM+(state&4); /* select the correct BOM */ 1175 pArgs->sourceLimit=pArgs->source+(state&3); /* replay bytes */ 1176 1177 /* no offsets: not enough for output */ 1178 T_UConverter_toUnicode_UTF32_BE(pArgs, pErrorCode); 1179 pArgs->source=source; 1180 pArgs->sourceLimit=sourceLimit; 1181 state=8; 1182 break; 1183 } 1184 } 1185 1186 cnv->mode=state; 1187 } 1188 1189 static UChar32 U_CALLCONV 1190 _UTF32GetNextUChar(UConverterToUnicodeArgs *pArgs, 1191 UErrorCode *pErrorCode) { 1192 switch(pArgs->converter->mode) { 1193 case 8: 1194 return T_UConverter_getNextUChar_UTF32_BE(pArgs, pErrorCode); 1195 case 9: 1196 return T_UConverter_getNextUChar_UTF32_LE(pArgs, pErrorCode); 1197 default: 1198 return UCNV_GET_NEXT_UCHAR_USE_TO_U; 1199 } 1200 } 1201 U_CDECL_END 1202 static const UConverterImpl _UTF32Impl = { 1203 UCNV_UTF32, 1204 1205 NULL, 1206 NULL, 1207 1208 _UTF32Open, 1209 NULL, 1210 _UTF32Reset, 1211 1212 _UTF32ToUnicodeWithOffsets, 1213 _UTF32ToUnicodeWithOffsets, 1214 #if U_IS_BIG_ENDIAN 1215 T_UConverter_fromUnicode_UTF32_BE, 1216 T_UConverter_fromUnicode_UTF32_BE_OFFSET_LOGIC, 1217 #else 1218 T_UConverter_fromUnicode_UTF32_LE, 1219 T_UConverter_fromUnicode_UTF32_LE_OFFSET_LOGIC, 1220 #endif 1221 _UTF32GetNextUChar, 1222 1223 NULL, /* ### TODO implement getStarters for all Unicode encodings?! */ 1224 NULL, 1225 NULL, 1226 NULL, 1227 ucnv_getNonSurrogateUnicodeSet, 1228 1229 NULL, 1230 NULL 1231 }; 1232 1233 /* The 1236 CCSID refers to any version of Unicode with a BOM sensitive endianess of UTF-32 */ 1234 static const UConverterStaticData _UTF32StaticData = { 1235 sizeof(UConverterStaticData), 1236 "UTF-32", 1237 1236, 1238 UCNV_IBM, UCNV_UTF32, 4, 4, 1239 #if U_IS_BIG_ENDIAN 1240 { 0, 0, 0xff, 0xfd }, 4, 1241 #else 1242 { 0xfd, 0xff, 0, 0 }, 4, 1243 #endif 1244 FALSE, FALSE, 1245 0, 1246 0, 1247 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1248 }; 1249 1250 const UConverterSharedData _UTF32Data = 1251 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF32StaticData, &_UTF32Impl); 1252 1253 #endif 1254