1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv_u8.c 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jul01 14 * created by: Markus W. Scherer 15 * 16 * UTF-8 converter implementation. Used to be in ucnv_utf.c. 17 * 18 * Also, CESU-8 implementation, see UTR 26. 19 * The CESU-8 converter uses all the same functions as the 20 * UTF-8 converter, with a branch for converting supplementary code points. 21 */ 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_CONVERSION 26 27 #include "unicode/ucnv.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf8.h" 30 #include "unicode/utf16.h" 31 #include "uassert.h" 32 #include "ucnv_bld.h" 33 #include "ucnv_cnv.h" 34 #include "cmemory.h" 35 #include "ustr_imp.h" 36 37 /* Prototypes --------------------------------------------------------------- */ 38 39 /* Keep these here to make finicky compilers happy */ 40 41 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, 42 UErrorCode *err); 43 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, 44 UErrorCode *err); 45 46 47 /* UTF-8 -------------------------------------------------------------------- */ 48 49 #define MAXIMUM_UCS2 0x0000FFFF 50 51 static const uint32_t offsetsFromUTF8[5] = {0, 52 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, 53 (uint32_t) 0x03C82080 54 }; 55 56 static UBool hasCESU8Data(const UConverter *cnv) 57 { 58 #if UCONFIG_ONLY_HTML_CONVERSION 59 return FALSE; 60 #else 61 return (UBool)(cnv->sharedData == &_CESU8Data); 62 #endif 63 } 64 U_CDECL_BEGIN 65 static void U_CALLCONV ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, 66 UErrorCode * err) 67 { 68 UConverter *cnv = args->converter; 69 const unsigned char *mySource = (unsigned char *) args->source; 70 UChar *myTarget = args->target; 71 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 72 const UChar *targetLimit = args->targetLimit; 73 unsigned char *toUBytes = cnv->toUBytes; 74 UBool isCESU8 = hasCESU8Data(cnv); 75 uint32_t ch, ch2 = 0; 76 int32_t i, inBytes; 77 78 /* Restore size of current sequence */ 79 if (cnv->toUnicodeStatus && myTarget < targetLimit) 80 { 81 inBytes = cnv->mode; /* restore # of bytes to consume */ 82 i = cnv->toULength; /* restore # of bytes consumed */ 83 cnv->toULength = 0; 84 85 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 86 cnv->toUnicodeStatus = 0; 87 goto morebytes; 88 } 89 90 91 while (mySource < sourceLimit && myTarget < targetLimit) 92 { 93 ch = *(mySource++); 94 if (U8_IS_SINGLE(ch)) /* Simple case */ 95 { 96 *(myTarget++) = (UChar) ch; 97 } 98 else 99 { 100 /* store the first char */ 101 toUBytes[0] = (char)ch; 102 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); /* lookup current sequence length */ 103 i = 1; 104 105 morebytes: 106 while (i < inBytes) 107 { 108 if (mySource < sourceLimit) 109 { 110 toUBytes[i] = (char) (ch2 = *mySource); 111 if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) && 112 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) 113 { 114 break; /* i < inBytes */ 115 } 116 ch = (ch << 6) + ch2; 117 ++mySource; 118 i++; 119 } 120 else 121 { 122 /* stores a partially calculated target*/ 123 cnv->toUnicodeStatus = ch; 124 cnv->mode = inBytes; 125 cnv->toULength = (int8_t) i; 126 goto donefornow; 127 } 128 } 129 130 // In CESU-8, only surrogates, not supplementary code points, are encoded directly. 131 if (i == inBytes && (!isCESU8 || i <= 3)) 132 { 133 /* Remove the accumulated high bits */ 134 ch -= offsetsFromUTF8[inBytes]; 135 136 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 137 if (ch <= MAXIMUM_UCS2) 138 { 139 /* fits in 16 bits */ 140 *(myTarget++) = (UChar) ch; 141 } 142 else 143 { 144 /* write out the surrogates */ 145 *(myTarget++) = U16_LEAD(ch); 146 ch = U16_TRAIL(ch); 147 if (myTarget < targetLimit) 148 { 149 *(myTarget++) = (UChar)ch; 150 } 151 else 152 { 153 /* Put in overflow buffer (not handled here) */ 154 cnv->UCharErrorBuffer[0] = (UChar) ch; 155 cnv->UCharErrorBufferLength = 1; 156 *err = U_BUFFER_OVERFLOW_ERROR; 157 break; 158 } 159 } 160 } 161 else 162 { 163 cnv->toULength = (int8_t)i; 164 *err = U_ILLEGAL_CHAR_FOUND; 165 break; 166 } 167 } 168 } 169 170 donefornow: 171 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 172 { 173 /* End of target buffer */ 174 *err = U_BUFFER_OVERFLOW_ERROR; 175 } 176 177 args->target = myTarget; 178 args->source = (const char *) mySource; 179 } 180 181 static void U_CALLCONV ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, 182 UErrorCode * err) 183 { 184 UConverter *cnv = args->converter; 185 const unsigned char *mySource = (unsigned char *) args->source; 186 UChar *myTarget = args->target; 187 int32_t *myOffsets = args->offsets; 188 int32_t offsetNum = 0; 189 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 190 const UChar *targetLimit = args->targetLimit; 191 unsigned char *toUBytes = cnv->toUBytes; 192 UBool isCESU8 = hasCESU8Data(cnv); 193 uint32_t ch, ch2 = 0; 194 int32_t i, inBytes; 195 196 /* Restore size of current sequence */ 197 if (cnv->toUnicodeStatus && myTarget < targetLimit) 198 { 199 inBytes = cnv->mode; /* restore # of bytes to consume */ 200 i = cnv->toULength; /* restore # of bytes consumed */ 201 cnv->toULength = 0; 202 203 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 204 cnv->toUnicodeStatus = 0; 205 goto morebytes; 206 } 207 208 while (mySource < sourceLimit && myTarget < targetLimit) 209 { 210 ch = *(mySource++); 211 if (U8_IS_SINGLE(ch)) /* Simple case */ 212 { 213 *(myTarget++) = (UChar) ch; 214 *(myOffsets++) = offsetNum++; 215 } 216 else 217 { 218 toUBytes[0] = (char)ch; 219 inBytes = U8_COUNT_BYTES_NON_ASCII(ch); 220 i = 1; 221 222 morebytes: 223 while (i < inBytes) 224 { 225 if (mySource < sourceLimit) 226 { 227 toUBytes[i] = (char) (ch2 = *mySource); 228 if (!icu::UTF8::isValidTrail(ch, ch2, i, inBytes) && 229 !(isCESU8 && i == 1 && ch == 0xed && U8_IS_TRAIL(ch2))) 230 { 231 break; /* i < inBytes */ 232 } 233 ch = (ch << 6) + ch2; 234 ++mySource; 235 i++; 236 } 237 else 238 { 239 cnv->toUnicodeStatus = ch; 240 cnv->mode = inBytes; 241 cnv->toULength = (int8_t)i; 242 goto donefornow; 243 } 244 } 245 246 // In CESU-8, only surrogates, not supplementary code points, are encoded directly. 247 if (i == inBytes && (!isCESU8 || i <= 3)) 248 { 249 /* Remove the accumulated high bits */ 250 ch -= offsetsFromUTF8[inBytes]; 251 252 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 253 if (ch <= MAXIMUM_UCS2) 254 { 255 /* fits in 16 bits */ 256 *(myTarget++) = (UChar) ch; 257 *(myOffsets++) = offsetNum; 258 } 259 else 260 { 261 /* write out the surrogates */ 262 *(myTarget++) = U16_LEAD(ch); 263 *(myOffsets++) = offsetNum; 264 ch = U16_TRAIL(ch); 265 if (myTarget < targetLimit) 266 { 267 *(myTarget++) = (UChar)ch; 268 *(myOffsets++) = offsetNum; 269 } 270 else 271 { 272 cnv->UCharErrorBuffer[0] = (UChar) ch; 273 cnv->UCharErrorBufferLength = 1; 274 *err = U_BUFFER_OVERFLOW_ERROR; 275 } 276 } 277 offsetNum += i; 278 } 279 else 280 { 281 cnv->toULength = (int8_t)i; 282 *err = U_ILLEGAL_CHAR_FOUND; 283 break; 284 } 285 } 286 } 287 288 donefornow: 289 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 290 { /* End of target buffer */ 291 *err = U_BUFFER_OVERFLOW_ERROR; 292 } 293 294 args->target = myTarget; 295 args->source = (const char *) mySource; 296 args->offsets = myOffsets; 297 } 298 U_CDECL_END 299 300 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, 301 UErrorCode * err) 302 { 303 UConverter *cnv = args->converter; 304 const UChar *mySource = args->source; 305 const UChar *sourceLimit = args->sourceLimit; 306 uint8_t *myTarget = (uint8_t *) args->target; 307 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 308 uint8_t *tempPtr; 309 UChar32 ch; 310 uint8_t tempBuf[4]; 311 int32_t indexToWrite; 312 UBool isNotCESU8 = !hasCESU8Data(cnv); 313 314 if (cnv->fromUChar32 && myTarget < targetLimit) 315 { 316 ch = cnv->fromUChar32; 317 cnv->fromUChar32 = 0; 318 goto lowsurrogate; 319 } 320 321 while (mySource < sourceLimit && myTarget < targetLimit) 322 { 323 ch = *(mySource++); 324 325 if (ch < 0x80) /* Single byte */ 326 { 327 *(myTarget++) = (uint8_t) ch; 328 } 329 else if (ch < 0x800) /* Double byte */ 330 { 331 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 332 if (myTarget < targetLimit) 333 { 334 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 335 } 336 else 337 { 338 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 339 cnv->charErrorBufferLength = 1; 340 *err = U_BUFFER_OVERFLOW_ERROR; 341 } 342 } 343 else { 344 /* Check for surrogates */ 345 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 346 lowsurrogate: 347 if (mySource < sourceLimit) { 348 /* test both code units */ 349 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 350 /* convert and consume this supplementary code point */ 351 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 352 ++mySource; 353 /* exit this condition tree */ 354 } 355 else { 356 /* this is an unpaired trail or lead code unit */ 357 /* callback(illegal) */ 358 cnv->fromUChar32 = ch; 359 *err = U_ILLEGAL_CHAR_FOUND; 360 break; 361 } 362 } 363 else { 364 /* no more input */ 365 cnv->fromUChar32 = ch; 366 break; 367 } 368 } 369 370 /* Do we write the buffer directly for speed, 371 or do we have to be careful about target buffer space? */ 372 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 373 374 if (ch <= MAXIMUM_UCS2) { 375 indexToWrite = 2; 376 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 377 } 378 else { 379 indexToWrite = 3; 380 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 381 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 382 } 383 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 384 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 385 386 if (tempPtr == myTarget) { 387 /* There was enough space to write the codepoint directly. */ 388 myTarget += (indexToWrite + 1); 389 } 390 else { 391 /* We might run out of room soon. Write it slowly. */ 392 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 393 if (myTarget < targetLimit) { 394 *(myTarget++) = *tempPtr; 395 } 396 else { 397 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 398 *err = U_BUFFER_OVERFLOW_ERROR; 399 } 400 } 401 } 402 } 403 } 404 405 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 406 { 407 *err = U_BUFFER_OVERFLOW_ERROR; 408 } 409 410 args->target = (char *) myTarget; 411 args->source = mySource; 412 } 413 414 U_CFUNC void U_CALLCONV ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, 415 UErrorCode * err) 416 { 417 UConverter *cnv = args->converter; 418 const UChar *mySource = args->source; 419 int32_t *myOffsets = args->offsets; 420 const UChar *sourceLimit = args->sourceLimit; 421 uint8_t *myTarget = (uint8_t *) args->target; 422 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 423 uint8_t *tempPtr; 424 UChar32 ch; 425 int32_t offsetNum, nextSourceIndex; 426 int32_t indexToWrite; 427 uint8_t tempBuf[4]; 428 UBool isNotCESU8 = !hasCESU8Data(cnv); 429 430 if (cnv->fromUChar32 && myTarget < targetLimit) 431 { 432 ch = cnv->fromUChar32; 433 cnv->fromUChar32 = 0; 434 offsetNum = -1; 435 nextSourceIndex = 0; 436 goto lowsurrogate; 437 } else { 438 offsetNum = 0; 439 } 440 441 while (mySource < sourceLimit && myTarget < targetLimit) 442 { 443 ch = *(mySource++); 444 445 if (ch < 0x80) /* Single byte */ 446 { 447 *(myOffsets++) = offsetNum++; 448 *(myTarget++) = (char) ch; 449 } 450 else if (ch < 0x800) /* Double byte */ 451 { 452 *(myOffsets++) = offsetNum; 453 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 454 if (myTarget < targetLimit) 455 { 456 *(myOffsets++) = offsetNum++; 457 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 458 } 459 else 460 { 461 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 462 cnv->charErrorBufferLength = 1; 463 *err = U_BUFFER_OVERFLOW_ERROR; 464 } 465 } 466 else 467 /* Check for surrogates */ 468 { 469 nextSourceIndex = offsetNum + 1; 470 471 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 472 lowsurrogate: 473 if (mySource < sourceLimit) { 474 /* test both code units */ 475 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 476 /* convert and consume this supplementary code point */ 477 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 478 ++mySource; 479 ++nextSourceIndex; 480 /* exit this condition tree */ 481 } 482 else { 483 /* this is an unpaired trail or lead code unit */ 484 /* callback(illegal) */ 485 cnv->fromUChar32 = ch; 486 *err = U_ILLEGAL_CHAR_FOUND; 487 break; 488 } 489 } 490 else { 491 /* no more input */ 492 cnv->fromUChar32 = ch; 493 break; 494 } 495 } 496 497 /* Do we write the buffer directly for speed, 498 or do we have to be careful about target buffer space? */ 499 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 500 501 if (ch <= MAXIMUM_UCS2) { 502 indexToWrite = 2; 503 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 504 } 505 else { 506 indexToWrite = 3; 507 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 508 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 509 } 510 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 511 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 512 513 if (tempPtr == myTarget) { 514 /* There was enough space to write the codepoint directly. */ 515 myTarget += (indexToWrite + 1); 516 myOffsets[0] = offsetNum; 517 myOffsets[1] = offsetNum; 518 myOffsets[2] = offsetNum; 519 if (indexToWrite >= 3) { 520 myOffsets[3] = offsetNum; 521 } 522 myOffsets += (indexToWrite + 1); 523 } 524 else { 525 /* We might run out of room soon. Write it slowly. */ 526 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 527 if (myTarget < targetLimit) 528 { 529 *(myOffsets++) = offsetNum; 530 *(myTarget++) = *tempPtr; 531 } 532 else 533 { 534 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 535 *err = U_BUFFER_OVERFLOW_ERROR; 536 } 537 } 538 } 539 offsetNum = nextSourceIndex; 540 } 541 } 542 543 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 544 { 545 *err = U_BUFFER_OVERFLOW_ERROR; 546 } 547 548 args->target = (char *) myTarget; 549 args->source = mySource; 550 args->offsets = myOffsets; 551 } 552 553 U_CDECL_BEGIN 554 static UChar32 U_CALLCONV ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, 555 UErrorCode *err) { 556 UConverter *cnv; 557 const uint8_t *sourceInitial; 558 const uint8_t *source; 559 uint8_t myByte; 560 UChar32 ch; 561 int8_t i; 562 563 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ 564 565 cnv = args->converter; 566 sourceInitial = source = (const uint8_t *)args->source; 567 if (source >= (const uint8_t *)args->sourceLimit) 568 { 569 /* no input */ 570 *err = U_INDEX_OUTOFBOUNDS_ERROR; 571 return 0xffff; 572 } 573 574 myByte = (uint8_t)*(source++); 575 if (U8_IS_SINGLE(myByte)) 576 { 577 args->source = (const char *)source; 578 return (UChar32)myByte; 579 } 580 581 uint16_t countTrailBytes = U8_COUNT_TRAIL_BYTES(myByte); 582 if (countTrailBytes == 0) { 583 cnv->toUBytes[0] = myByte; 584 cnv->toULength = 1; 585 *err = U_ILLEGAL_CHAR_FOUND; 586 args->source = (const char *)source; 587 return 0xffff; 588 } 589 590 /*The byte sequence is longer than the buffer area passed*/ 591 if (((const char *)source + countTrailBytes) > args->sourceLimit) 592 { 593 /* check if all of the remaining bytes are trail bytes */ 594 uint16_t extraBytesToWrite = countTrailBytes + 1; 595 cnv->toUBytes[0] = myByte; 596 i = 1; 597 *err = U_TRUNCATED_CHAR_FOUND; 598 while(source < (const uint8_t *)args->sourceLimit) { 599 uint8_t b = *source; 600 if(icu::UTF8::isValidTrail(myByte, b, i, extraBytesToWrite)) { 601 cnv->toUBytes[i++] = b; 602 ++source; 603 } else { 604 /* error even before we run out of input */ 605 *err = U_ILLEGAL_CHAR_FOUND; 606 break; 607 } 608 } 609 cnv->toULength = i; 610 args->source = (const char *)source; 611 return 0xffff; 612 } 613 614 ch = myByte << 6; 615 if(countTrailBytes == 2) { 616 uint8_t t1 = *source, t2; 617 if(U8_IS_VALID_LEAD3_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source)) { 618 args->source = (const char *)(source + 1); 619 return (((ch + t1) << 6) + t2) - offsetsFromUTF8[3]; 620 } 621 } else if(countTrailBytes == 1) { 622 uint8_t t1 = *source; 623 if(U8_IS_TRAIL(t1)) { 624 args->source = (const char *)(source + 1); 625 return (ch + t1) - offsetsFromUTF8[2]; 626 } 627 } else { // countTrailBytes == 3 628 uint8_t t1 = *source, t2, t3; 629 if(U8_IS_VALID_LEAD4_AND_T1(myByte, t1) && U8_IS_TRAIL(t2 = *++source) && 630 U8_IS_TRAIL(t3 = *++source)) { 631 args->source = (const char *)(source + 1); 632 return (((((ch + t1) << 6) + t2) << 6) + t3) - offsetsFromUTF8[4]; 633 } 634 } 635 args->source = (const char *)source; 636 637 for(i = 0; sourceInitial < source; ++i) { 638 cnv->toUBytes[i] = *sourceInitial++; 639 } 640 cnv->toULength = i; 641 *err = U_ILLEGAL_CHAR_FOUND; 642 return 0xffff; 643 } 644 U_CDECL_END 645 646 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ 647 648 U_CDECL_BEGIN 649 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ 650 static void U_CALLCONV 651 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 652 UConverterToUnicodeArgs *pToUArgs, 653 UErrorCode *pErrorCode) { 654 UConverter *utf8; 655 const uint8_t *source, *sourceLimit; 656 uint8_t *target; 657 int32_t targetCapacity; 658 int32_t count; 659 660 int8_t oldToULength, toULength, toULimit; 661 662 UChar32 c; 663 uint8_t b, t1, t2; 664 665 /* set up the local pointers */ 666 utf8=pToUArgs->converter; 667 source=(uint8_t *)pToUArgs->source; 668 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 669 target=(uint8_t *)pFromUArgs->target; 670 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 671 672 /* get the converter state from the UTF-8 UConverter */ 673 c=(UChar32)utf8->toUnicodeStatus; 674 if(c!=0) { 675 toULength=oldToULength=utf8->toULength; 676 toULimit=(int8_t)utf8->mode; 677 } else { 678 toULength=oldToULength=toULimit=0; 679 } 680 681 count=(int32_t)(sourceLimit-source)+oldToULength; 682 if(count<toULimit) { 683 /* 684 * Not enough input to complete the partial character. 685 * Jump to moreBytes below - it will not output to target. 686 */ 687 } else if(targetCapacity<toULimit) { 688 /* 689 * Not enough target capacity to output the partial character. 690 * Let the standard converter handle this. 691 */ 692 *pErrorCode=U_USING_DEFAULT_WARNING; 693 return; 694 } else { 695 // Use a single counter for source and target, counting the minimum of 696 // the source length and the target capacity. 697 // Let the standard converter handle edge cases. 698 const uint8_t *limit=sourceLimit; 699 if(count>targetCapacity) { 700 limit-=(count-targetCapacity); 701 count=targetCapacity; 702 } 703 704 // The conversion loop checks count>0 only once per 1/2/3-byte character. 705 // If the buffer ends with a truncated 2- or 3-byte sequence, 706 // then we reduce the count to stop before that, 707 // and collect the remaining bytes after the conversion loop. 708 { 709 // Do not go back into the bytes that will be read for finishing a partial 710 // sequence from the previous buffer. 711 int32_t length=count-toULimit; 712 if(length>0) { 713 uint8_t b1=*(limit-1); 714 if(U8_IS_SINGLE(b1)) { 715 // common ASCII character 716 } else if(U8_IS_TRAIL(b1) && length>=2) { 717 uint8_t b2=*(limit-2); 718 if(0xe0<=b2 && b2<0xf0 && U8_IS_VALID_LEAD3_AND_T1(b2, b1)) { 719 // truncated 3-byte sequence 720 count-=2; 721 } 722 } else if(0xc2<=b1 && b1<0xf0) { 723 // truncated 2- or 3-byte sequence 724 --count; 725 } 726 } 727 } 728 } 729 730 if(c!=0) { 731 utf8->toUnicodeStatus=0; 732 utf8->toULength=0; 733 goto moreBytes; 734 /* See note in ucnv_SBCSFromUTF8() about this goto. */ 735 } 736 737 /* conversion loop */ 738 while(count>0) { 739 b=*source++; 740 if(U8_IS_SINGLE(b)) { 741 /* convert ASCII */ 742 *target++=b; 743 --count; 744 continue; 745 } else { 746 if(b>=0xe0) { 747 if( /* handle U+0800..U+FFFF inline */ 748 b<0xf0 && 749 U8_IS_VALID_LEAD3_AND_T1(b, t1=source[0]) && 750 U8_IS_TRAIL(t2=source[1]) 751 ) { 752 source+=2; 753 *target++=b; 754 *target++=t1; 755 *target++=t2; 756 count-=3; 757 continue; 758 } 759 } else { 760 if( /* handle U+0080..U+07FF inline */ 761 b>=0xc2 && 762 U8_IS_TRAIL(t1=*source) 763 ) { 764 ++source; 765 *target++=b; 766 *target++=t1; 767 count-=2; 768 continue; 769 } 770 } 771 772 /* handle "complicated" and error cases, and continuing partial characters */ 773 oldToULength=0; 774 toULength=1; 775 toULimit=U8_COUNT_BYTES_NON_ASCII(b); 776 c=b; 777 moreBytes: 778 while(toULength<toULimit) { 779 if(source<sourceLimit) { 780 b=*source; 781 if(icu::UTF8::isValidTrail(c, b, toULength, toULimit)) { 782 ++source; 783 ++toULength; 784 c=(c<<6)+b; 785 } else { 786 break; /* sequence too short, stop with toULength<toULimit */ 787 } 788 } else { 789 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 790 source-=(toULength-oldToULength); 791 while(oldToULength<toULength) { 792 utf8->toUBytes[oldToULength++]=*source++; 793 } 794 utf8->toUnicodeStatus=c; 795 utf8->toULength=toULength; 796 utf8->mode=toULimit; 797 pToUArgs->source=(char *)source; 798 pFromUArgs->target=(char *)target; 799 return; 800 } 801 } 802 803 if(toULength!=toULimit) { 804 /* error handling: illegal UTF-8 byte sequence */ 805 source-=(toULength-oldToULength); 806 while(oldToULength<toULength) { 807 utf8->toUBytes[oldToULength++]=*source++; 808 } 809 utf8->toULength=toULength; 810 pToUArgs->source=(char *)source; 811 pFromUArgs->target=(char *)target; 812 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 813 return; 814 } 815 816 /* copy the legal byte sequence to the target */ 817 if(count>=toULength) { 818 int8_t i; 819 820 for(i=0; i<oldToULength; ++i) { 821 *target++=utf8->toUBytes[i]; 822 } 823 source-=(toULength-oldToULength); 824 for(; i<toULength; ++i) { 825 *target++=*source++; 826 } 827 count-=toULength; 828 } else { 829 // A supplementary character that does not fit into the target. 830 // Let the standard converter handle this. 831 source-=(toULength-oldToULength); 832 pToUArgs->source=(char *)source; 833 pFromUArgs->target=(char *)target; 834 *pErrorCode=U_USING_DEFAULT_WARNING; 835 return; 836 } 837 } 838 } 839 U_ASSERT(count>=0); 840 841 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { 842 if(target==(const uint8_t *)pFromUArgs->targetLimit) { 843 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 844 } else { 845 b=*source; 846 toULimit=U8_COUNT_BYTES(b); 847 if(toULimit>(sourceLimit-source)) { 848 /* collect a truncated byte sequence */ 849 toULength=0; 850 c=b; 851 for(;;) { 852 utf8->toUBytes[toULength++]=b; 853 if(++source==sourceLimit) { 854 /* partial byte sequence at end of source */ 855 utf8->toUnicodeStatus=c; 856 utf8->toULength=toULength; 857 utf8->mode=toULimit; 858 break; 859 } else if(!U8_IS_TRAIL(b=*source)) { 860 /* lead byte in trail byte position */ 861 utf8->toULength=toULength; 862 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 863 break; 864 } 865 c=(c<<6)+b; 866 } 867 } else { 868 /* partial-sequence target overflow: fall back to the pivoting implementation */ 869 *pErrorCode=U_USING_DEFAULT_WARNING; 870 } 871 } 872 } 873 874 /* write back the updated pointers */ 875 pToUArgs->source=(char *)source; 876 pFromUArgs->target=(char *)target; 877 } 878 879 U_CDECL_END 880 881 /* UTF-8 converter data ----------------------------------------------------- */ 882 883 static const UConverterImpl _UTF8Impl={ 884 UCNV_UTF8, 885 886 NULL, 887 NULL, 888 889 NULL, 890 NULL, 891 NULL, 892 893 ucnv_toUnicode_UTF8, 894 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 895 ucnv_fromUnicode_UTF8, 896 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 897 ucnv_getNextUChar_UTF8, 898 899 NULL, 900 NULL, 901 NULL, 902 NULL, 903 ucnv_getNonSurrogateUnicodeSet, 904 905 ucnv_UTF8FromUTF8, 906 ucnv_UTF8FromUTF8 907 }; 908 909 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ 910 static const UConverterStaticData _UTF8StaticData={ 911 sizeof(UConverterStaticData), 912 "UTF-8", 913 1208, UCNV_IBM, UCNV_UTF8, 914 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 915 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 916 0, 917 0, 918 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 919 }; 920 921 922 const UConverterSharedData _UTF8Data= 923 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); 924 925 /* CESU-8 converter data ---------------------------------------------------- */ 926 927 static const UConverterImpl _CESU8Impl={ 928 UCNV_CESU8, 929 930 NULL, 931 NULL, 932 933 NULL, 934 NULL, 935 NULL, 936 937 ucnv_toUnicode_UTF8, 938 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 939 ucnv_fromUnicode_UTF8, 940 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 941 NULL, 942 943 NULL, 944 NULL, 945 NULL, 946 NULL, 947 ucnv_getCompleteUnicodeSet, 948 949 NULL, 950 NULL 951 }; 952 953 static const UConverterStaticData _CESU8StaticData={ 954 sizeof(UConverterStaticData), 955 "CESU-8", 956 9400, /* CCSID for CESU-8 */ 957 UCNV_UNKNOWN, UCNV_CESU8, 1, 3, 958 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 959 0, 960 0, 961 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 962 }; 963 964 965 const UConverterSharedData _CESU8Data= 966 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); 967 968 #endif 969