1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucnv_u8.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002jul01 14 * created by: Markus W. Scherer 15 * 16 * UTF-8 converter implementation. Used to be in ucnv_utf.c. 17 * 18 * Also, CESU-8 implementation, see UTR 26. 19 * The CESU-8 converter uses all the same functions as the 20 * UTF-8 converter, with a branch for converting supplementary code points. 21 */ 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_CONVERSION 26 27 #include "unicode/ucnv.h" 28 #include "unicode/utf.h" 29 #include "unicode/utf8.h" 30 #include "unicode/utf16.h" 31 #include "ucnv_bld.h" 32 #include "ucnv_cnv.h" 33 #include "cmemory.h" 34 35 /* Prototypes --------------------------------------------------------------- */ 36 37 /* Keep these here to make finicky compilers happy */ 38 39 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, 40 UErrorCode *err); 41 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, 42 UErrorCode *err); 43 44 45 /* UTF-8 -------------------------------------------------------------------- */ 46 47 /* UTF-8 Conversion DATA 48 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 49 */ 50 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ 51 #define MAXIMUM_UCS2 0x0000FFFF 52 #define MAXIMUM_UTF 0x0010FFFF 53 #define MAXIMUM_UCS4 0x7FFFFFFF 54 #define HALF_SHIFT 10 55 #define HALF_BASE 0x0010000 56 #define HALF_MASK 0x3FF 57 #define SURROGATE_HIGH_START 0xD800 58 #define SURROGATE_HIGH_END 0xDBFF 59 #define SURROGATE_LOW_START 0xDC00 60 #define SURROGATE_LOW_END 0xDFFF 61 62 /* -SURROGATE_LOW_START + HALF_BASE */ 63 #define SURROGATE_LOW_BASE 9216 64 65 static const uint32_t offsetsFromUTF8[7] = {0, 66 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, 67 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 68 }; 69 70 /* END OF UTF-8 Conversion DATA */ 71 72 static const int8_t bytesFromUTF8[256] = { 73 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 74 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 75 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 76 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 77 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 80 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 81 }; 82 83 /* 84 * Starting with Unicode 3.0.1: 85 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; 86 * byte sequences with more than 4 bytes are illegal in UTF-8, 87 * which is tested with impossible values for them 88 */ 89 static const uint32_t 90 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; 91 92 static UBool hasCESU8Data(const UConverter *cnv) 93 { 94 #if UCONFIG_ONLY_HTML_CONVERSION 95 return FALSE; 96 #else 97 return (UBool)(cnv->sharedData == &_CESU8Data); 98 #endif 99 } 100 101 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, 102 UErrorCode * err) 103 { 104 UConverter *cnv = args->converter; 105 const unsigned char *mySource = (unsigned char *) args->source; 106 UChar *myTarget = args->target; 107 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 108 const UChar *targetLimit = args->targetLimit; 109 unsigned char *toUBytes = cnv->toUBytes; 110 UBool isCESU8 = hasCESU8Data(cnv); 111 uint32_t ch, ch2 = 0; 112 int32_t i, inBytes; 113 114 /* Restore size of current sequence */ 115 if (cnv->toUnicodeStatus && myTarget < targetLimit) 116 { 117 inBytes = cnv->mode; /* restore # of bytes to consume */ 118 i = cnv->toULength; /* restore # of bytes consumed */ 119 cnv->toULength = 0; 120 121 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 122 cnv->toUnicodeStatus = 0; 123 goto morebytes; 124 } 125 126 127 while (mySource < sourceLimit && myTarget < targetLimit) 128 { 129 ch = *(mySource++); 130 if (ch < 0x80) /* Simple case */ 131 { 132 *(myTarget++) = (UChar) ch; 133 } 134 else 135 { 136 /* store the first char */ 137 toUBytes[0] = (char)ch; 138 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ 139 i = 1; 140 141 morebytes: 142 while (i < inBytes) 143 { 144 if (mySource < sourceLimit) 145 { 146 toUBytes[i] = (char) (ch2 = *mySource); 147 if (!U8_IS_TRAIL(ch2)) 148 { 149 break; /* i < inBytes */ 150 } 151 ch = (ch << 6) + ch2; 152 ++mySource; 153 i++; 154 } 155 else 156 { 157 /* stores a partially calculated target*/ 158 cnv->toUnicodeStatus = ch; 159 cnv->mode = inBytes; 160 cnv->toULength = (int8_t) i; 161 goto donefornow; 162 } 163 } 164 165 /* Remove the accumulated high bits */ 166 ch -= offsetsFromUTF8[inBytes]; 167 168 /* 169 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 170 * - use only trail bytes after a lead byte (checked above) 171 * - use the right number of trail bytes for a given lead byte 172 * - encode a code point <= U+10ffff 173 * - use the fewest possible number of bytes for their code points 174 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 175 * 176 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 177 * There are no irregular sequences any more. 178 * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 179 */ 180 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 181 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) 182 { 183 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 184 if (ch <= MAXIMUM_UCS2) 185 { 186 /* fits in 16 bits */ 187 *(myTarget++) = (UChar) ch; 188 } 189 else 190 { 191 /* write out the surrogates */ 192 ch -= HALF_BASE; 193 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 194 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 195 if (myTarget < targetLimit) 196 { 197 *(myTarget++) = (UChar)ch; 198 } 199 else 200 { 201 /* Put in overflow buffer (not handled here) */ 202 cnv->UCharErrorBuffer[0] = (UChar) ch; 203 cnv->UCharErrorBufferLength = 1; 204 *err = U_BUFFER_OVERFLOW_ERROR; 205 break; 206 } 207 } 208 } 209 else 210 { 211 cnv->toULength = (int8_t)i; 212 *err = U_ILLEGAL_CHAR_FOUND; 213 break; 214 } 215 } 216 } 217 218 donefornow: 219 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 220 { 221 /* End of target buffer */ 222 *err = U_BUFFER_OVERFLOW_ERROR; 223 } 224 225 args->target = myTarget; 226 args->source = (const char *) mySource; 227 } 228 229 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, 230 UErrorCode * err) 231 { 232 UConverter *cnv = args->converter; 233 const unsigned char *mySource = (unsigned char *) args->source; 234 UChar *myTarget = args->target; 235 int32_t *myOffsets = args->offsets; 236 int32_t offsetNum = 0; 237 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 238 const UChar *targetLimit = args->targetLimit; 239 unsigned char *toUBytes = cnv->toUBytes; 240 UBool isCESU8 = hasCESU8Data(cnv); 241 uint32_t ch, ch2 = 0; 242 int32_t i, inBytes; 243 244 /* Restore size of current sequence */ 245 if (cnv->toUnicodeStatus && myTarget < targetLimit) 246 { 247 inBytes = cnv->mode; /* restore # of bytes to consume */ 248 i = cnv->toULength; /* restore # of bytes consumed */ 249 cnv->toULength = 0; 250 251 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 252 cnv->toUnicodeStatus = 0; 253 goto morebytes; 254 } 255 256 while (mySource < sourceLimit && myTarget < targetLimit) 257 { 258 ch = *(mySource++); 259 if (ch < 0x80) /* Simple case */ 260 { 261 *(myTarget++) = (UChar) ch; 262 *(myOffsets++) = offsetNum++; 263 } 264 else 265 { 266 toUBytes[0] = (char)ch; 267 inBytes = bytesFromUTF8[ch]; 268 i = 1; 269 270 morebytes: 271 while (i < inBytes) 272 { 273 if (mySource < sourceLimit) 274 { 275 toUBytes[i] = (char) (ch2 = *mySource); 276 if (!U8_IS_TRAIL(ch2)) 277 { 278 break; /* i < inBytes */ 279 } 280 ch = (ch << 6) + ch2; 281 ++mySource; 282 i++; 283 } 284 else 285 { 286 cnv->toUnicodeStatus = ch; 287 cnv->mode = inBytes; 288 cnv->toULength = (int8_t)i; 289 goto donefornow; 290 } 291 } 292 293 /* Remove the accumulated high bits */ 294 ch -= offsetsFromUTF8[inBytes]; 295 296 /* 297 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 298 * - use only trail bytes after a lead byte (checked above) 299 * - use the right number of trail bytes for a given lead byte 300 * - encode a code point <= U+10ffff 301 * - use the fewest possible number of bytes for their code points 302 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 303 * 304 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 305 * There are no irregular sequences any more. 306 * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 307 */ 308 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 309 (isCESU8 ? i <= 3 : !U_IS_SURROGATE(ch))) 310 { 311 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 312 if (ch <= MAXIMUM_UCS2) 313 { 314 /* fits in 16 bits */ 315 *(myTarget++) = (UChar) ch; 316 *(myOffsets++) = offsetNum; 317 } 318 else 319 { 320 /* write out the surrogates */ 321 ch -= HALF_BASE; 322 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 323 *(myOffsets++) = offsetNum; 324 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 325 if (myTarget < targetLimit) 326 { 327 *(myTarget++) = (UChar)ch; 328 *(myOffsets++) = offsetNum; 329 } 330 else 331 { 332 cnv->UCharErrorBuffer[0] = (UChar) ch; 333 cnv->UCharErrorBufferLength = 1; 334 *err = U_BUFFER_OVERFLOW_ERROR; 335 } 336 } 337 offsetNum += i; 338 } 339 else 340 { 341 cnv->toULength = (int8_t)i; 342 *err = U_ILLEGAL_CHAR_FOUND; 343 break; 344 } 345 } 346 } 347 348 donefornow: 349 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 350 { /* End of target buffer */ 351 *err = U_BUFFER_OVERFLOW_ERROR; 352 } 353 354 args->target = myTarget; 355 args->source = (const char *) mySource; 356 args->offsets = myOffsets; 357 } 358 359 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, 360 UErrorCode * err) 361 { 362 UConverter *cnv = args->converter; 363 const UChar *mySource = args->source; 364 const UChar *sourceLimit = args->sourceLimit; 365 uint8_t *myTarget = (uint8_t *) args->target; 366 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 367 uint8_t *tempPtr; 368 UChar32 ch; 369 uint8_t tempBuf[4]; 370 int32_t indexToWrite; 371 UBool isNotCESU8 = !hasCESU8Data(cnv); 372 373 if (cnv->fromUChar32 && myTarget < targetLimit) 374 { 375 ch = cnv->fromUChar32; 376 cnv->fromUChar32 = 0; 377 goto lowsurrogate; 378 } 379 380 while (mySource < sourceLimit && myTarget < targetLimit) 381 { 382 ch = *(mySource++); 383 384 if (ch < 0x80) /* Single byte */ 385 { 386 *(myTarget++) = (uint8_t) ch; 387 } 388 else if (ch < 0x800) /* Double byte */ 389 { 390 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 391 if (myTarget < targetLimit) 392 { 393 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 394 } 395 else 396 { 397 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 398 cnv->charErrorBufferLength = 1; 399 *err = U_BUFFER_OVERFLOW_ERROR; 400 } 401 } 402 else { 403 /* Check for surrogates */ 404 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 405 lowsurrogate: 406 if (mySource < sourceLimit) { 407 /* test both code units */ 408 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 409 /* convert and consume this supplementary code point */ 410 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 411 ++mySource; 412 /* exit this condition tree */ 413 } 414 else { 415 /* this is an unpaired trail or lead code unit */ 416 /* callback(illegal) */ 417 cnv->fromUChar32 = ch; 418 *err = U_ILLEGAL_CHAR_FOUND; 419 break; 420 } 421 } 422 else { 423 /* no more input */ 424 cnv->fromUChar32 = ch; 425 break; 426 } 427 } 428 429 /* Do we write the buffer directly for speed, 430 or do we have to be careful about target buffer space? */ 431 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 432 433 if (ch <= MAXIMUM_UCS2) { 434 indexToWrite = 2; 435 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 436 } 437 else { 438 indexToWrite = 3; 439 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 440 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 441 } 442 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 443 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 444 445 if (tempPtr == myTarget) { 446 /* There was enough space to write the codepoint directly. */ 447 myTarget += (indexToWrite + 1); 448 } 449 else { 450 /* We might run out of room soon. Write it slowly. */ 451 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 452 if (myTarget < targetLimit) { 453 *(myTarget++) = *tempPtr; 454 } 455 else { 456 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 457 *err = U_BUFFER_OVERFLOW_ERROR; 458 } 459 } 460 } 461 } 462 } 463 464 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 465 { 466 *err = U_BUFFER_OVERFLOW_ERROR; 467 } 468 469 args->target = (char *) myTarget; 470 args->source = mySource; 471 } 472 473 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, 474 UErrorCode * err) 475 { 476 UConverter *cnv = args->converter; 477 const UChar *mySource = args->source; 478 int32_t *myOffsets = args->offsets; 479 const UChar *sourceLimit = args->sourceLimit; 480 uint8_t *myTarget = (uint8_t *) args->target; 481 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 482 uint8_t *tempPtr; 483 UChar32 ch; 484 int32_t offsetNum, nextSourceIndex; 485 int32_t indexToWrite; 486 uint8_t tempBuf[4]; 487 UBool isNotCESU8 = !hasCESU8Data(cnv); 488 489 if (cnv->fromUChar32 && myTarget < targetLimit) 490 { 491 ch = cnv->fromUChar32; 492 cnv->fromUChar32 = 0; 493 offsetNum = -1; 494 nextSourceIndex = 0; 495 goto lowsurrogate; 496 } else { 497 offsetNum = 0; 498 } 499 500 while (mySource < sourceLimit && myTarget < targetLimit) 501 { 502 ch = *(mySource++); 503 504 if (ch < 0x80) /* Single byte */ 505 { 506 *(myOffsets++) = offsetNum++; 507 *(myTarget++) = (char) ch; 508 } 509 else if (ch < 0x800) /* Double byte */ 510 { 511 *(myOffsets++) = offsetNum; 512 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 513 if (myTarget < targetLimit) 514 { 515 *(myOffsets++) = offsetNum++; 516 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 517 } 518 else 519 { 520 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 521 cnv->charErrorBufferLength = 1; 522 *err = U_BUFFER_OVERFLOW_ERROR; 523 } 524 } 525 else 526 /* Check for surrogates */ 527 { 528 nextSourceIndex = offsetNum + 1; 529 530 if(U16_IS_SURROGATE(ch) && isNotCESU8) { 531 lowsurrogate: 532 if (mySource < sourceLimit) { 533 /* test both code units */ 534 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(*mySource)) { 535 /* convert and consume this supplementary code point */ 536 ch=U16_GET_SUPPLEMENTARY(ch, *mySource); 537 ++mySource; 538 ++nextSourceIndex; 539 /* exit this condition tree */ 540 } 541 else { 542 /* this is an unpaired trail or lead code unit */ 543 /* callback(illegal) */ 544 cnv->fromUChar32 = ch; 545 *err = U_ILLEGAL_CHAR_FOUND; 546 break; 547 } 548 } 549 else { 550 /* no more input */ 551 cnv->fromUChar32 = ch; 552 break; 553 } 554 } 555 556 /* Do we write the buffer directly for speed, 557 or do we have to be careful about target buffer space? */ 558 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 559 560 if (ch <= MAXIMUM_UCS2) { 561 indexToWrite = 2; 562 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 563 } 564 else { 565 indexToWrite = 3; 566 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 567 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 568 } 569 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 570 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 571 572 if (tempPtr == myTarget) { 573 /* There was enough space to write the codepoint directly. */ 574 myTarget += (indexToWrite + 1); 575 myOffsets[0] = offsetNum; 576 myOffsets[1] = offsetNum; 577 myOffsets[2] = offsetNum; 578 if (indexToWrite >= 3) { 579 myOffsets[3] = offsetNum; 580 } 581 myOffsets += (indexToWrite + 1); 582 } 583 else { 584 /* We might run out of room soon. Write it slowly. */ 585 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 586 if (myTarget < targetLimit) 587 { 588 *(myOffsets++) = offsetNum; 589 *(myTarget++) = *tempPtr; 590 } 591 else 592 { 593 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 594 *err = U_BUFFER_OVERFLOW_ERROR; 595 } 596 } 597 } 598 offsetNum = nextSourceIndex; 599 } 600 } 601 602 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 603 { 604 *err = U_BUFFER_OVERFLOW_ERROR; 605 } 606 607 args->target = (char *) myTarget; 608 args->source = mySource; 609 args->offsets = myOffsets; 610 } 611 612 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, 613 UErrorCode *err) { 614 UConverter *cnv; 615 const uint8_t *sourceInitial; 616 const uint8_t *source; 617 uint16_t extraBytesToWrite; 618 uint8_t myByte; 619 UChar32 ch; 620 int8_t i, isLegalSequence; 621 622 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ 623 624 cnv = args->converter; 625 sourceInitial = source = (const uint8_t *)args->source; 626 if (source >= (const uint8_t *)args->sourceLimit) 627 { 628 /* no input */ 629 *err = U_INDEX_OUTOFBOUNDS_ERROR; 630 return 0xffff; 631 } 632 633 myByte = (uint8_t)*(source++); 634 if (myByte < 0x80) 635 { 636 args->source = (const char *)source; 637 return (UChar32)myByte; 638 } 639 640 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; 641 if (extraBytesToWrite == 0) { 642 cnv->toUBytes[0] = myByte; 643 cnv->toULength = 1; 644 *err = U_ILLEGAL_CHAR_FOUND; 645 args->source = (const char *)source; 646 return 0xffff; 647 } 648 649 /*The byte sequence is longer than the buffer area passed*/ 650 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) 651 { 652 /* check if all of the remaining bytes are trail bytes */ 653 cnv->toUBytes[0] = myByte; 654 i = 1; 655 *err = U_TRUNCATED_CHAR_FOUND; 656 while(source < (const uint8_t *)args->sourceLimit) { 657 if(U8_IS_TRAIL(myByte = *source)) { 658 cnv->toUBytes[i++] = myByte; 659 ++source; 660 } else { 661 /* error even before we run out of input */ 662 *err = U_ILLEGAL_CHAR_FOUND; 663 break; 664 } 665 } 666 cnv->toULength = i; 667 args->source = (const char *)source; 668 return 0xffff; 669 } 670 671 isLegalSequence = 1; 672 ch = myByte << 6; 673 switch(extraBytesToWrite) 674 { 675 /* note: code falls through cases! (sic)*/ 676 case 6: 677 ch += (myByte = *source); 678 ch <<= 6; 679 if (!U8_IS_TRAIL(myByte)) 680 { 681 isLegalSequence = 0; 682 break; 683 } 684 ++source; 685 U_FALLTHROUGH; 686 case 5: 687 ch += (myByte = *source); 688 ch <<= 6; 689 if (!U8_IS_TRAIL(myByte)) 690 { 691 isLegalSequence = 0; 692 break; 693 } 694 ++source; 695 U_FALLTHROUGH; 696 case 4: 697 ch += (myByte = *source); 698 ch <<= 6; 699 if (!U8_IS_TRAIL(myByte)) 700 { 701 isLegalSequence = 0; 702 break; 703 } 704 ++source; 705 U_FALLTHROUGH; 706 case 3: 707 ch += (myByte = *source); 708 ch <<= 6; 709 if (!U8_IS_TRAIL(myByte)) 710 { 711 isLegalSequence = 0; 712 break; 713 } 714 ++source; 715 U_FALLTHROUGH; 716 case 2: 717 ch += (myByte = *source); 718 if (!U8_IS_TRAIL(myByte)) 719 { 720 isLegalSequence = 0; 721 break; 722 } 723 ++source; 724 }; 725 ch -= offsetsFromUTF8[extraBytesToWrite]; 726 args->source = (const char *)source; 727 728 /* 729 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 730 * - use only trail bytes after a lead byte (checked above) 731 * - use the right number of trail bytes for a given lead byte 732 * - encode a code point <= U+10ffff 733 * - use the fewest possible number of bytes for their code points 734 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 735 * 736 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 737 * There are no irregular sequences any more. 738 */ 739 if (isLegalSequence && 740 (uint32_t)ch <= MAXIMUM_UTF && 741 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && 742 !U_IS_SURROGATE(ch) 743 ) { 744 return ch; /* return the code point */ 745 } 746 747 for(i = 0; sourceInitial < source; ++i) { 748 cnv->toUBytes[i] = *sourceInitial++; 749 } 750 cnv->toULength = i; 751 *err = U_ILLEGAL_CHAR_FOUND; 752 return 0xffff; 753 } 754 755 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ 756 757 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 758 static const UChar32 759 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 760 761 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ 762 static const UChar32 763 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 764 765 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ 766 static void 767 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 768 UConverterToUnicodeArgs *pToUArgs, 769 UErrorCode *pErrorCode) { 770 UConverter *utf8; 771 const uint8_t *source, *sourceLimit; 772 uint8_t *target; 773 int32_t targetCapacity; 774 int32_t count; 775 776 int8_t oldToULength, toULength, toULimit; 777 778 UChar32 c; 779 uint8_t b, t1, t2; 780 781 /* set up the local pointers */ 782 utf8=pToUArgs->converter; 783 source=(uint8_t *)pToUArgs->source; 784 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 785 target=(uint8_t *)pFromUArgs->target; 786 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 787 788 /* get the converter state from the UTF-8 UConverter */ 789 c=(UChar32)utf8->toUnicodeStatus; 790 if(c!=0) { 791 toULength=oldToULength=utf8->toULength; 792 toULimit=(int8_t)utf8->mode; 793 } else { 794 toULength=oldToULength=toULimit=0; 795 } 796 797 count=(int32_t)(sourceLimit-source)+oldToULength; 798 if(count<toULimit) { 799 /* 800 * Not enough input to complete the partial character. 801 * Jump to moreBytes below - it will not output to target. 802 */ 803 } else if(targetCapacity<toULimit) { 804 /* 805 * Not enough target capacity to output the partial character. 806 * Let the standard converter handle this. 807 */ 808 *pErrorCode=U_USING_DEFAULT_WARNING; 809 return; 810 } else { 811 /* 812 * Use a single counter for source and target, counting the minimum of 813 * the source length and the target capacity. 814 * As a result, the source length is checked only once per multi-byte 815 * character instead of twice. 816 * 817 * Make sure that the last byte sequence is complete, or else 818 * stop just before it. 819 * (The longest legal byte sequence has 3 trail bytes.) 820 * Count oldToULength (number of source bytes from a previous buffer) 821 * into the source length but reduce the source index by toULimit 822 * while going back over trail bytes in order to not go back into 823 * the bytes that will be read for finishing a partial 824 * sequence from the previous buffer. 825 * Let the standard converter handle edge cases. 826 */ 827 int32_t i; 828 829 if(count>targetCapacity) { 830 count=targetCapacity; 831 } 832 833 i=0; 834 while(i<3 && i<(count-toULimit)) { 835 b=source[count-oldToULength-i-1]; 836 if(U8_IS_TRAIL(b)) { 837 ++i; 838 } else { 839 if(i<U8_COUNT_TRAIL_BYTES(b)) { 840 /* stop converting before the lead byte if there are not enough trail bytes for it */ 841 count-=i+1; 842 } 843 break; 844 } 845 } 846 } 847 848 if(c!=0) { 849 utf8->toUnicodeStatus=0; 850 utf8->toULength=0; 851 goto moreBytes; 852 /* See note in ucnv_SBCSFromUTF8() about this goto. */ 853 } 854 855 /* conversion loop */ 856 while(count>0) { 857 b=*source++; 858 if((int8_t)b>=0) { 859 /* convert ASCII */ 860 *target++=b; 861 --count; 862 continue; 863 } else { 864 if(b>0xe0) { 865 if( /* handle U+1000..U+D7FF inline */ 866 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || 867 (b==0xed && (t1 <= 0x9f))) && 868 (t2=source[1]) >= 0x80 && t2 <= 0xbf 869 ) { 870 source+=2; 871 *target++=b; 872 *target++=t1; 873 *target++=t2; 874 count-=3; 875 continue; 876 } 877 } else if(b<0xe0) { 878 if( /* handle U+0080..U+07FF inline */ 879 b>=0xc2 && 880 (t1=*source) >= 0x80 && t1 <= 0xbf 881 ) { 882 ++source; 883 *target++=b; 884 *target++=t1; 885 count-=2; 886 continue; 887 } 888 } else if(b==0xe0) { 889 if( /* handle U+0800..U+0FFF inline */ 890 (t1=source[0]) >= 0xa0 && t1 <= 0xbf && 891 (t2=source[1]) >= 0x80 && t2 <= 0xbf 892 ) { 893 source+=2; 894 *target++=b; 895 *target++=t1; 896 *target++=t2; 897 count-=3; 898 continue; 899 } 900 } 901 902 /* handle "complicated" and error cases, and continuing partial characters */ 903 oldToULength=0; 904 toULength=1; 905 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 906 c=b; 907 moreBytes: 908 while(toULength<toULimit) { 909 if(source<sourceLimit) { 910 b=*source; 911 if(U8_IS_TRAIL(b)) { 912 ++source; 913 ++toULength; 914 c=(c<<6)+b; 915 } else { 916 break; /* sequence too short, stop with toULength<toULimit */ 917 } 918 } else { 919 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 920 source-=(toULength-oldToULength); 921 while(oldToULength<toULength) { 922 utf8->toUBytes[oldToULength++]=*source++; 923 } 924 utf8->toUnicodeStatus=c; 925 utf8->toULength=toULength; 926 utf8->mode=toULimit; 927 pToUArgs->source=(char *)source; 928 pFromUArgs->target=(char *)target; 929 return; 930 } 931 } 932 933 if( toULength==toULimit && /* consumed all trail bytes */ 934 (toULength==3 || toULength==2) && /* BMP */ 935 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 936 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 937 ) { 938 /* legal byte sequence for BMP code point */ 939 } else if( 940 toULength==toULimit && toULength==4 && 941 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 942 ) { 943 /* legal byte sequence for supplementary code point */ 944 } else { 945 /* error handling: illegal UTF-8 byte sequence */ 946 source-=(toULength-oldToULength); 947 while(oldToULength<toULength) { 948 utf8->toUBytes[oldToULength++]=*source++; 949 } 950 utf8->toULength=toULength; 951 pToUArgs->source=(char *)source; 952 pFromUArgs->target=(char *)target; 953 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 954 return; 955 } 956 957 /* copy the legal byte sequence to the target */ 958 { 959 int8_t i; 960 961 for(i=0; i<oldToULength; ++i) { 962 *target++=utf8->toUBytes[i]; 963 } 964 source-=(toULength-oldToULength); 965 for(; i<toULength; ++i) { 966 *target++=*source++; 967 } 968 count-=toULength; 969 } 970 } 971 } 972 973 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { 974 if(target==(const uint8_t *)pFromUArgs->targetLimit) { 975 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 976 } else { 977 b=*source; 978 toULimit=U8_COUNT_TRAIL_BYTES(b)+1; 979 if(toULimit>(sourceLimit-source)) { 980 /* collect a truncated byte sequence */ 981 toULength=0; 982 c=b; 983 for(;;) { 984 utf8->toUBytes[toULength++]=b; 985 if(++source==sourceLimit) { 986 /* partial byte sequence at end of source */ 987 utf8->toUnicodeStatus=c; 988 utf8->toULength=toULength; 989 utf8->mode=toULimit; 990 break; 991 } else if(!U8_IS_TRAIL(b=*source)) { 992 /* lead byte in trail byte position */ 993 utf8->toULength=toULength; 994 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 995 break; 996 } 997 c=(c<<6)+b; 998 } 999 } else { 1000 /* partial-sequence target overflow: fall back to the pivoting implementation */ 1001 *pErrorCode=U_USING_DEFAULT_WARNING; 1002 } 1003 } 1004 } 1005 1006 /* write back the updated pointers */ 1007 pToUArgs->source=(char *)source; 1008 pFromUArgs->target=(char *)target; 1009 } 1010 1011 /* UTF-8 converter data ----------------------------------------------------- */ 1012 1013 static const UConverterImpl _UTF8Impl={ 1014 UCNV_UTF8, 1015 1016 NULL, 1017 NULL, 1018 1019 NULL, 1020 NULL, 1021 NULL, 1022 1023 ucnv_toUnicode_UTF8, 1024 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1025 ucnv_fromUnicode_UTF8, 1026 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1027 ucnv_getNextUChar_UTF8, 1028 1029 NULL, 1030 NULL, 1031 NULL, 1032 NULL, 1033 ucnv_getNonSurrogateUnicodeSet, 1034 1035 ucnv_UTF8FromUTF8, 1036 ucnv_UTF8FromUTF8 1037 }; 1038 1039 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ 1040 static const UConverterStaticData _UTF8StaticData={ 1041 sizeof(UConverterStaticData), 1042 "UTF-8", 1043 1208, UCNV_IBM, UCNV_UTF8, 1044 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 1045 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1046 0, 1047 0, 1048 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1049 }; 1050 1051 1052 const UConverterSharedData _UTF8Data= 1053 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_UTF8StaticData, &_UTF8Impl); 1054 1055 /* CESU-8 converter data ---------------------------------------------------- */ 1056 1057 static const UConverterImpl _CESU8Impl={ 1058 UCNV_CESU8, 1059 1060 NULL, 1061 NULL, 1062 1063 NULL, 1064 NULL, 1065 NULL, 1066 1067 ucnv_toUnicode_UTF8, 1068 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1069 ucnv_fromUnicode_UTF8, 1070 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1071 NULL, 1072 1073 NULL, 1074 NULL, 1075 NULL, 1076 NULL, 1077 ucnv_getCompleteUnicodeSet, 1078 1079 NULL, 1080 NULL 1081 }; 1082 1083 static const UConverterStaticData _CESU8StaticData={ 1084 sizeof(UConverterStaticData), 1085 "CESU-8", 1086 9400, /* CCSID for CESU-8 */ 1087 UCNV_UNKNOWN, UCNV_CESU8, 1, 3, 1088 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1089 0, 1090 0, 1091 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1092 }; 1093 1094 1095 const UConverterSharedData _CESU8Data= 1096 UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_CESU8StaticData, &_CESU8Impl); 1097 1098 #endif 1099