1 /* 2 ********************************************************************** 3 * Copyright (C) 2002-2007, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucnv_u8.c 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2002jul01 12 * created by: Markus W. Scherer 13 * 14 * UTF-8 converter implementation. Used to be in ucnv_utf.c. 15 * 16 * Also, CESU-8 implementation, see UTR 26. 17 * The CESU-8 converter uses all the same functions as the 18 * UTF-8 converter, with a branch for converting supplementary code points. 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_CONVERSION 24 25 #include "unicode/ucnv.h" 26 #include "ucnv_bld.h" 27 #include "ucnv_cnv.h" 28 #include "cmemory.h" 29 30 /* Prototypes --------------------------------------------------------------- */ 31 32 /* Keep these here to make finicky compilers happy */ 33 34 U_CFUNC void ucnv_fromUnicode_UTF8(UConverterFromUnicodeArgs *args, 35 UErrorCode *err); 36 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC(UConverterFromUnicodeArgs *args, 37 UErrorCode *err); 38 39 40 /* UTF-8 -------------------------------------------------------------------- */ 41 42 /* UTF-8 Conversion DATA 43 * for more information see Unicode Standard 2.0, Transformation Formats Appendix A-9 44 */ 45 /*static const uint32_t REPLACEMENT_CHARACTER = 0x0000FFFD;*/ 46 #define MAXIMUM_UCS2 0x0000FFFF 47 #define MAXIMUM_UTF 0x0010FFFF 48 #define MAXIMUM_UCS4 0x7FFFFFFF 49 #define HALF_SHIFT 10 50 #define HALF_BASE 0x0010000 51 #define HALF_MASK 0x3FF 52 #define SURROGATE_HIGH_START 0xD800 53 #define SURROGATE_HIGH_END 0xDBFF 54 #define SURROGATE_LOW_START 0xDC00 55 #define SURROGATE_LOW_END 0xDFFF 56 57 /* -SURROGATE_LOW_START + HALF_BASE */ 58 #define SURROGATE_LOW_BASE 9216 59 60 static const uint32_t offsetsFromUTF8[7] = {0, 61 (uint32_t) 0x00000000, (uint32_t) 0x00003080, (uint32_t) 0x000E2080, 62 (uint32_t) 0x03C82080, (uint32_t) 0xFA082080, (uint32_t) 0x82082080 63 }; 64 65 /* END OF UTF-8 Conversion DATA */ 66 67 static const int8_t bytesFromUTF8[256] = { 68 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 69 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 70 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 71 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 72 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 74 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 75 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 76 }; 77 78 /* 79 * Starting with Unicode 3.0.1: 80 * UTF-8 byte sequences of length N _must_ encode code points of or above utf8_minChar32[N]; 81 * byte sequences with more than 4 bytes are illegal in UTF-8, 82 * which is tested with impossible values for them 83 */ 84 static const uint32_t 85 utf8_minChar32[7]={ 0, 0, 0x80, 0x800, 0x10000, 0xffffffff, 0xffffffff }; 86 87 static void ucnv_toUnicode_UTF8 (UConverterToUnicodeArgs * args, 88 UErrorCode * err) 89 { 90 UConverter *cnv = args->converter; 91 const unsigned char *mySource = (unsigned char *) args->source; 92 UChar *myTarget = args->target; 93 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 94 const UChar *targetLimit = args->targetLimit; 95 unsigned char *toUBytes = cnv->toUBytes; 96 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); 97 uint32_t ch, ch2 = 0; 98 int32_t i, inBytes; 99 100 /* Restore size of current sequence */ 101 if (cnv->toUnicodeStatus && myTarget < targetLimit) 102 { 103 inBytes = cnv->mode; /* restore # of bytes to consume */ 104 i = cnv->toULength; /* restore # of bytes consumed */ 105 cnv->toULength = 0; 106 107 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 108 cnv->toUnicodeStatus = 0; 109 goto morebytes; 110 } 111 112 113 while (mySource < sourceLimit && myTarget < targetLimit) 114 { 115 ch = *(mySource++); 116 if (ch < 0x80) /* Simple case */ 117 { 118 *(myTarget++) = (UChar) ch; 119 } 120 else 121 { 122 /* store the first char */ 123 toUBytes[0] = (char)ch; 124 inBytes = bytesFromUTF8[ch]; /* lookup current sequence length */ 125 i = 1; 126 127 morebytes: 128 while (i < inBytes) 129 { 130 if (mySource < sourceLimit) 131 { 132 toUBytes[i] = (char) (ch2 = *mySource); 133 if (!UTF8_IS_TRAIL(ch2)) 134 { 135 break; /* i < inBytes */ 136 } 137 ch = (ch << 6) + ch2; 138 ++mySource; 139 i++; 140 } 141 else 142 { 143 /* stores a partially calculated target*/ 144 cnv->toUnicodeStatus = ch; 145 cnv->mode = inBytes; 146 cnv->toULength = (int8_t) i; 147 goto donefornow; 148 } 149 } 150 151 /* Remove the accumulated high bits */ 152 ch -= offsetsFromUTF8[inBytes]; 153 154 /* 155 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 156 * - use only trail bytes after a lead byte (checked above) 157 * - use the right number of trail bytes for a given lead byte 158 * - encode a code point <= U+10ffff 159 * - use the fewest possible number of bytes for their code points 160 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 161 * 162 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 163 * There are no irregular sequences any more. 164 * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 165 */ 166 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 167 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) 168 { 169 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 170 if (ch <= MAXIMUM_UCS2) 171 { 172 /* fits in 16 bits */ 173 *(myTarget++) = (UChar) ch; 174 } 175 else 176 { 177 /* write out the surrogates */ 178 ch -= HALF_BASE; 179 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 180 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 181 if (myTarget < targetLimit) 182 { 183 *(myTarget++) = (UChar)ch; 184 } 185 else 186 { 187 /* Put in overflow buffer (not handled here) */ 188 cnv->UCharErrorBuffer[0] = (UChar) ch; 189 cnv->UCharErrorBufferLength = 1; 190 *err = U_BUFFER_OVERFLOW_ERROR; 191 break; 192 } 193 } 194 } 195 else 196 { 197 cnv->toULength = (int8_t)i; 198 *err = U_ILLEGAL_CHAR_FOUND; 199 break; 200 } 201 } 202 } 203 204 donefornow: 205 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 206 { 207 /* End of target buffer */ 208 *err = U_BUFFER_OVERFLOW_ERROR; 209 } 210 211 args->target = myTarget; 212 args->source = (const char *) mySource; 213 } 214 215 static void ucnv_toUnicode_UTF8_OFFSETS_LOGIC (UConverterToUnicodeArgs * args, 216 UErrorCode * err) 217 { 218 UConverter *cnv = args->converter; 219 const unsigned char *mySource = (unsigned char *) args->source; 220 UChar *myTarget = args->target; 221 int32_t *myOffsets = args->offsets; 222 int32_t offsetNum = 0; 223 const unsigned char *sourceLimit = (unsigned char *) args->sourceLimit; 224 const UChar *targetLimit = args->targetLimit; 225 unsigned char *toUBytes = cnv->toUBytes; 226 UBool isCESU8 = (UBool)(cnv->sharedData == &_CESU8Data); 227 uint32_t ch, ch2 = 0; 228 int32_t i, inBytes; 229 230 /* Restore size of current sequence */ 231 if (cnv->toUnicodeStatus && myTarget < targetLimit) 232 { 233 inBytes = cnv->mode; /* restore # of bytes to consume */ 234 i = cnv->toULength; /* restore # of bytes consumed */ 235 cnv->toULength = 0; 236 237 ch = cnv->toUnicodeStatus;/*Stores the previously calculated ch from a previous call*/ 238 cnv->toUnicodeStatus = 0; 239 goto morebytes; 240 } 241 242 while (mySource < sourceLimit && myTarget < targetLimit) 243 { 244 ch = *(mySource++); 245 if (ch < 0x80) /* Simple case */ 246 { 247 *(myTarget++) = (UChar) ch; 248 *(myOffsets++) = offsetNum++; 249 } 250 else 251 { 252 toUBytes[0] = (char)ch; 253 inBytes = bytesFromUTF8[ch]; 254 i = 1; 255 256 morebytes: 257 while (i < inBytes) 258 { 259 if (mySource < sourceLimit) 260 { 261 toUBytes[i] = (char) (ch2 = *mySource); 262 if (!UTF8_IS_TRAIL(ch2)) 263 { 264 break; /* i < inBytes */ 265 } 266 ch = (ch << 6) + ch2; 267 ++mySource; 268 i++; 269 } 270 else 271 { 272 cnv->toUnicodeStatus = ch; 273 cnv->mode = inBytes; 274 cnv->toULength = (int8_t)i; 275 goto donefornow; 276 } 277 } 278 279 /* Remove the accumulated high bits */ 280 ch -= offsetsFromUTF8[inBytes]; 281 282 /* 283 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 284 * - use only trail bytes after a lead byte (checked above) 285 * - use the right number of trail bytes for a given lead byte 286 * - encode a code point <= U+10ffff 287 * - use the fewest possible number of bytes for their code points 288 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 289 * 290 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 291 * There are no irregular sequences any more. 292 * In CESU-8, only surrogates, not supplementary code points, are encoded directly. 293 */ 294 if (i == inBytes && ch <= MAXIMUM_UTF && ch >= utf8_minChar32[i] && 295 (isCESU8 ? i <= 3 : !UTF_IS_SURROGATE(ch))) 296 { 297 /* Normal valid byte when the loop has not prematurely terminated (i < inBytes) */ 298 if (ch <= MAXIMUM_UCS2) 299 { 300 /* fits in 16 bits */ 301 *(myTarget++) = (UChar) ch; 302 *(myOffsets++) = offsetNum; 303 } 304 else 305 { 306 /* write out the surrogates */ 307 ch -= HALF_BASE; 308 *(myTarget++) = (UChar) ((ch >> HALF_SHIFT) + SURROGATE_HIGH_START); 309 *(myOffsets++) = offsetNum; 310 ch = (ch & HALF_MASK) + SURROGATE_LOW_START; 311 if (myTarget < targetLimit) 312 { 313 *(myTarget++) = (UChar)ch; 314 *(myOffsets++) = offsetNum; 315 } 316 else 317 { 318 cnv->UCharErrorBuffer[0] = (UChar) ch; 319 cnv->UCharErrorBufferLength = 1; 320 *err = U_BUFFER_OVERFLOW_ERROR; 321 } 322 } 323 offsetNum += i; 324 } 325 else 326 { 327 cnv->toULength = (int8_t)i; 328 *err = U_ILLEGAL_CHAR_FOUND; 329 break; 330 } 331 } 332 } 333 334 donefornow: 335 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 336 { /* End of target buffer */ 337 *err = U_BUFFER_OVERFLOW_ERROR; 338 } 339 340 args->target = myTarget; 341 args->source = (const char *) mySource; 342 args->offsets = myOffsets; 343 } 344 345 U_CFUNC void ucnv_fromUnicode_UTF8 (UConverterFromUnicodeArgs * args, 346 UErrorCode * err) 347 { 348 UConverter *cnv = args->converter; 349 const UChar *mySource = args->source; 350 const UChar *sourceLimit = args->sourceLimit; 351 uint8_t *myTarget = (uint8_t *) args->target; 352 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 353 uint8_t *tempPtr; 354 UChar32 ch; 355 uint8_t tempBuf[4]; 356 int32_t indexToWrite; 357 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); 358 359 if (cnv->fromUChar32 && myTarget < targetLimit) 360 { 361 ch = cnv->fromUChar32; 362 cnv->fromUChar32 = 0; 363 goto lowsurrogate; 364 } 365 366 while (mySource < sourceLimit && myTarget < targetLimit) 367 { 368 ch = *(mySource++); 369 370 if (ch < 0x80) /* Single byte */ 371 { 372 *(myTarget++) = (uint8_t) ch; 373 } 374 else if (ch < 0x800) /* Double byte */ 375 { 376 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 377 if (myTarget < targetLimit) 378 { 379 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 380 } 381 else 382 { 383 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 384 cnv->charErrorBufferLength = 1; 385 *err = U_BUFFER_OVERFLOW_ERROR; 386 } 387 } 388 else { 389 /* Check for surrogates */ 390 if(UTF_IS_SURROGATE(ch) && isNotCESU8) { 391 lowsurrogate: 392 if (mySource < sourceLimit) { 393 /* test both code units */ 394 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) { 395 /* convert and consume this supplementary code point */ 396 ch=UTF16_GET_PAIR_VALUE(ch, *mySource); 397 ++mySource; 398 /* exit this condition tree */ 399 } 400 else { 401 /* this is an unpaired trail or lead code unit */ 402 /* callback(illegal) */ 403 cnv->fromUChar32 = ch; 404 *err = U_ILLEGAL_CHAR_FOUND; 405 break; 406 } 407 } 408 else { 409 /* no more input */ 410 cnv->fromUChar32 = ch; 411 break; 412 } 413 } 414 415 /* Do we write the buffer directly for speed, 416 or do we have to be careful about target buffer space? */ 417 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 418 419 if (ch <= MAXIMUM_UCS2) { 420 indexToWrite = 2; 421 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 422 } 423 else { 424 indexToWrite = 3; 425 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 426 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 427 } 428 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 429 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 430 431 if (tempPtr == myTarget) { 432 /* There was enough space to write the codepoint directly. */ 433 myTarget += (indexToWrite + 1); 434 } 435 else { 436 /* We might run out of room soon. Write it slowly. */ 437 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 438 if (myTarget < targetLimit) { 439 *(myTarget++) = *tempPtr; 440 } 441 else { 442 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 443 *err = U_BUFFER_OVERFLOW_ERROR; 444 } 445 } 446 } 447 } 448 } 449 450 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 451 { 452 *err = U_BUFFER_OVERFLOW_ERROR; 453 } 454 455 args->target = (char *) myTarget; 456 args->source = mySource; 457 } 458 459 U_CFUNC void ucnv_fromUnicode_UTF8_OFFSETS_LOGIC (UConverterFromUnicodeArgs * args, 460 UErrorCode * err) 461 { 462 UConverter *cnv = args->converter; 463 const UChar *mySource = args->source; 464 int32_t *myOffsets = args->offsets; 465 const UChar *sourceLimit = args->sourceLimit; 466 uint8_t *myTarget = (uint8_t *) args->target; 467 const uint8_t *targetLimit = (uint8_t *) args->targetLimit; 468 uint8_t *tempPtr; 469 UChar32 ch; 470 int32_t offsetNum, nextSourceIndex; 471 int32_t indexToWrite; 472 uint8_t tempBuf[4]; 473 UBool isNotCESU8 = (UBool)(cnv->sharedData != &_CESU8Data); 474 475 if (cnv->fromUChar32 && myTarget < targetLimit) 476 { 477 ch = cnv->fromUChar32; 478 cnv->fromUChar32 = 0; 479 offsetNum = -1; 480 nextSourceIndex = 0; 481 goto lowsurrogate; 482 } else { 483 offsetNum = 0; 484 } 485 486 while (mySource < sourceLimit && myTarget < targetLimit) 487 { 488 ch = *(mySource++); 489 490 if (ch < 0x80) /* Single byte */ 491 { 492 *(myOffsets++) = offsetNum++; 493 *(myTarget++) = (char) ch; 494 } 495 else if (ch < 0x800) /* Double byte */ 496 { 497 *(myOffsets++) = offsetNum; 498 *(myTarget++) = (uint8_t) ((ch >> 6) | 0xc0); 499 if (myTarget < targetLimit) 500 { 501 *(myOffsets++) = offsetNum++; 502 *(myTarget++) = (uint8_t) ((ch & 0x3f) | 0x80); 503 } 504 else 505 { 506 cnv->charErrorBuffer[0] = (uint8_t) ((ch & 0x3f) | 0x80); 507 cnv->charErrorBufferLength = 1; 508 *err = U_BUFFER_OVERFLOW_ERROR; 509 } 510 } 511 else 512 /* Check for surrogates */ 513 { 514 nextSourceIndex = offsetNum + 1; 515 516 if(UTF_IS_SURROGATE(ch) && isNotCESU8) { 517 lowsurrogate: 518 if (mySource < sourceLimit) { 519 /* test both code units */ 520 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_SECOND_SURROGATE(*mySource)) { 521 /* convert and consume this supplementary code point */ 522 ch=UTF16_GET_PAIR_VALUE(ch, *mySource); 523 ++mySource; 524 ++nextSourceIndex; 525 /* exit this condition tree */ 526 } 527 else { 528 /* this is an unpaired trail or lead code unit */ 529 /* callback(illegal) */ 530 cnv->fromUChar32 = ch; 531 *err = U_ILLEGAL_CHAR_FOUND; 532 break; 533 } 534 } 535 else { 536 /* no more input */ 537 cnv->fromUChar32 = ch; 538 break; 539 } 540 } 541 542 /* Do we write the buffer directly for speed, 543 or do we have to be careful about target buffer space? */ 544 tempPtr = (((targetLimit - myTarget) >= 4) ? myTarget : tempBuf); 545 546 if (ch <= MAXIMUM_UCS2) { 547 indexToWrite = 2; 548 tempPtr[0] = (uint8_t) ((ch >> 12) | 0xe0); 549 } 550 else { 551 indexToWrite = 3; 552 tempPtr[0] = (uint8_t) ((ch >> 18) | 0xf0); 553 tempPtr[1] = (uint8_t) (((ch >> 12) & 0x3f) | 0x80); 554 } 555 tempPtr[indexToWrite-1] = (uint8_t) (((ch >> 6) & 0x3f) | 0x80); 556 tempPtr[indexToWrite] = (uint8_t) ((ch & 0x3f) | 0x80); 557 558 if (tempPtr == myTarget) { 559 /* There was enough space to write the codepoint directly. */ 560 myTarget += (indexToWrite + 1); 561 myOffsets[0] = offsetNum; 562 myOffsets[1] = offsetNum; 563 myOffsets[2] = offsetNum; 564 if (indexToWrite >= 3) { 565 myOffsets[3] = offsetNum; 566 } 567 myOffsets += (indexToWrite + 1); 568 } 569 else { 570 /* We might run out of room soon. Write it slowly. */ 571 for (; tempPtr <= (tempBuf + indexToWrite); tempPtr++) { 572 if (myTarget < targetLimit) 573 { 574 *(myOffsets++) = offsetNum; 575 *(myTarget++) = *tempPtr; 576 } 577 else 578 { 579 cnv->charErrorBuffer[cnv->charErrorBufferLength++] = *tempPtr; 580 *err = U_BUFFER_OVERFLOW_ERROR; 581 } 582 } 583 } 584 offsetNum = nextSourceIndex; 585 } 586 } 587 588 if (mySource < sourceLimit && myTarget >= targetLimit && U_SUCCESS(*err)) 589 { 590 *err = U_BUFFER_OVERFLOW_ERROR; 591 } 592 593 args->target = (char *) myTarget; 594 args->source = mySource; 595 args->offsets = myOffsets; 596 } 597 598 static UChar32 ucnv_getNextUChar_UTF8(UConverterToUnicodeArgs *args, 599 UErrorCode *err) { 600 UConverter *cnv; 601 const uint8_t *sourceInitial; 602 const uint8_t *source; 603 uint16_t extraBytesToWrite; 604 uint8_t myByte; 605 UChar32 ch; 606 int8_t i, isLegalSequence; 607 608 /* UTF-8 only here, the framework handles CESU-8 to combine surrogate pairs */ 609 610 cnv = args->converter; 611 sourceInitial = source = (const uint8_t *)args->source; 612 if (source >= (const uint8_t *)args->sourceLimit) 613 { 614 /* no input */ 615 *err = U_INDEX_OUTOFBOUNDS_ERROR; 616 return 0xffff; 617 } 618 619 myByte = (uint8_t)*(source++); 620 if (myByte < 0x80) 621 { 622 args->source = (const char *)source; 623 return (UChar32)myByte; 624 } 625 626 extraBytesToWrite = (uint16_t)bytesFromUTF8[myByte]; 627 if (extraBytesToWrite == 0) { 628 cnv->toUBytes[0] = myByte; 629 cnv->toULength = 1; 630 *err = U_ILLEGAL_CHAR_FOUND; 631 args->source = (const char *)source; 632 return 0xffff; 633 } 634 635 /*The byte sequence is longer than the buffer area passed*/ 636 if (((const char *)source + extraBytesToWrite - 1) > args->sourceLimit) 637 { 638 /* check if all of the remaining bytes are trail bytes */ 639 cnv->toUBytes[0] = myByte; 640 i = 1; 641 *err = U_TRUNCATED_CHAR_FOUND; 642 while(source < (const uint8_t *)args->sourceLimit) { 643 if(U8_IS_TRAIL(myByte = *source)) { 644 cnv->toUBytes[i++] = myByte; 645 ++source; 646 } else { 647 /* error even before we run out of input */ 648 *err = U_ILLEGAL_CHAR_FOUND; 649 break; 650 } 651 } 652 cnv->toULength = i; 653 args->source = (const char *)source; 654 return 0xffff; 655 } 656 657 isLegalSequence = 1; 658 ch = myByte << 6; 659 switch(extraBytesToWrite) 660 { 661 /* note: code falls through cases! (sic)*/ 662 case 6: 663 ch += (myByte = *source); 664 ch <<= 6; 665 if (!UTF8_IS_TRAIL(myByte)) 666 { 667 isLegalSequence = 0; 668 break; 669 } 670 ++source; 671 case 5: 672 ch += (myByte = *source); 673 ch <<= 6; 674 if (!UTF8_IS_TRAIL(myByte)) 675 { 676 isLegalSequence = 0; 677 break; 678 } 679 ++source; 680 case 4: 681 ch += (myByte = *source); 682 ch <<= 6; 683 if (!UTF8_IS_TRAIL(myByte)) 684 { 685 isLegalSequence = 0; 686 break; 687 } 688 ++source; 689 case 3: 690 ch += (myByte = *source); 691 ch <<= 6; 692 if (!UTF8_IS_TRAIL(myByte)) 693 { 694 isLegalSequence = 0; 695 break; 696 } 697 ++source; 698 case 2: 699 ch += (myByte = *source); 700 if (!UTF8_IS_TRAIL(myByte)) 701 { 702 isLegalSequence = 0; 703 break; 704 } 705 ++source; 706 }; 707 ch -= offsetsFromUTF8[extraBytesToWrite]; 708 args->source = (const char *)source; 709 710 /* 711 * Legal UTF-8 byte sequences in Unicode 3.0.1 and up: 712 * - use only trail bytes after a lead byte (checked above) 713 * - use the right number of trail bytes for a given lead byte 714 * - encode a code point <= U+10ffff 715 * - use the fewest possible number of bytes for their code points 716 * - use at most 4 bytes (for i>=5 it is 0x10ffff<utf8_minChar32[]) 717 * 718 * Starting with Unicode 3.2, surrogate code points must not be encoded in UTF-8. 719 * There are no irregular sequences any more. 720 */ 721 if (isLegalSequence && 722 (uint32_t)ch <= MAXIMUM_UTF && 723 (uint32_t)ch >= utf8_minChar32[extraBytesToWrite] && 724 !U_IS_SURROGATE(ch) 725 ) { 726 return ch; /* return the code point */ 727 } 728 729 for(i = 0; sourceInitial < source; ++i) { 730 cnv->toUBytes[i] = *sourceInitial++; 731 } 732 cnv->toULength = i; 733 *err = U_ILLEGAL_CHAR_FOUND; 734 return 0xffff; 735 } 736 737 /* UTF-8-from-UTF-8 conversion functions ------------------------------------ */ 738 739 /* minimum code point values for n-byte UTF-8 sequences, n=0..4 */ 740 static const UChar32 741 utf8_minLegal[5]={ 0, 0, 0x80, 0x800, 0x10000 }; 742 743 /* offsets for n-byte UTF-8 sequences that were calculated with ((lead<<6)+trail)<<6+trail... */ 744 static const UChar32 745 utf8_offsets[7]={ 0, 0, 0x3080, 0xE2080, 0x3C82080 }; 746 747 /* "Convert" UTF-8 to UTF-8: Validate and copy. Modified from ucnv_DBCSFromUTF8(). */ 748 static void 749 ucnv_UTF8FromUTF8(UConverterFromUnicodeArgs *pFromUArgs, 750 UConverterToUnicodeArgs *pToUArgs, 751 UErrorCode *pErrorCode) { 752 UConverter *utf8, *cnv; 753 const uint8_t *source, *sourceLimit; 754 uint8_t *target; 755 int32_t targetCapacity; 756 int32_t count; 757 758 int8_t oldToULength, toULength, toULimit; 759 760 UChar32 c; 761 uint8_t b, t1, t2; 762 763 /* set up the local pointers */ 764 utf8=pToUArgs->converter; 765 cnv=pFromUArgs->converter; 766 source=(uint8_t *)pToUArgs->source; 767 sourceLimit=(uint8_t *)pToUArgs->sourceLimit; 768 target=(uint8_t *)pFromUArgs->target; 769 targetCapacity=(int32_t)(pFromUArgs->targetLimit-pFromUArgs->target); 770 771 /* get the converter state from the UTF-8 UConverter */ 772 c=(UChar32)utf8->toUnicodeStatus; 773 if(c!=0) { 774 toULength=oldToULength=utf8->toULength; 775 toULimit=(int8_t)utf8->mode; 776 } else { 777 toULength=oldToULength=toULimit=0; 778 } 779 780 count=(int32_t)(sourceLimit-source)+oldToULength; 781 if(count<toULimit) { 782 /* 783 * Not enough input to complete the partial character. 784 * Jump to moreBytes below - it will not output to target. 785 */ 786 } else if(targetCapacity<toULimit) { 787 /* 788 * Not enough target capacity to output the partial character. 789 * Let the standard converter handle this. 790 */ 791 *pErrorCode=U_USING_DEFAULT_WARNING; 792 return; 793 } else { 794 /* 795 * Use a single counter for source and target, counting the minimum of 796 * the source length and the target capacity. 797 * As a result, the source length is checked only once per multi-byte 798 * character instead of twice. 799 * 800 * Make sure that the last byte sequence is complete, or else 801 * stop just before it. 802 * (The longest legal byte sequence has 3 trail bytes.) 803 * Count oldToULength (number of source bytes from a previous buffer) 804 * into the source length but reduce the source index by toULimit 805 * while going back over trail bytes in order to not go back into 806 * the bytes that will be read for finishing a partial 807 * sequence from the previous buffer. 808 * Let the standard converter handle edge cases. 809 */ 810 int32_t i; 811 812 if(count>targetCapacity) { 813 count=targetCapacity; 814 } 815 816 i=0; 817 while(i<3 && i<(count-toULimit)) { 818 b=source[count-oldToULength-i-1]; 819 if(U8_IS_TRAIL(b)) { 820 ++i; 821 } else { 822 if(i<utf8_countTrailBytes[b]) { 823 /* stop converting before the lead byte if there are not enough trail bytes for it */ 824 count-=i+1; 825 } 826 break; 827 } 828 } 829 } 830 831 if(c!=0) { 832 utf8->toUnicodeStatus=0; 833 utf8->toULength=0; 834 goto moreBytes; 835 /* See note in ucnv_SBCSFromUTF8() about this goto. */ 836 } 837 838 /* conversion loop */ 839 while(count>0) { 840 b=*source++; 841 if((int8_t)b>=0) { 842 /* convert ASCII */ 843 *target++=b; 844 --count; 845 continue; 846 } else { 847 if(b>0xe0) { 848 if( /* handle U+1000..U+D7FF inline */ 849 (t1=source[0]) >= 0x80 && ((b<0xed && (t1 <= 0xbf)) || 850 (b==0xed && (t1 <= 0x9f))) && 851 (t2=source[1]) >= 0x80 && t2 <= 0xbf 852 ) { 853 source+=2; 854 *target++=b; 855 *target++=t1; 856 *target++=t2; 857 count-=3; 858 continue; 859 } 860 } else if(b<0xe0) { 861 if( /* handle U+0080..U+07FF inline */ 862 b>=0xc2 && 863 (t1=*source) >= 0x80 && t1 <= 0xbf 864 ) { 865 ++source; 866 *target++=b; 867 *target++=t1; 868 count-=2; 869 continue; 870 } 871 } else if(b==0xe0) { 872 if( /* handle U+0800..U+0FFF inline */ 873 (t1=source[0]) >= 0xa0 && t1 <= 0xbf && 874 (t2=source[1]) >= 0x80 && t2 <= 0xbf 875 ) { 876 source+=2; 877 *target++=b; 878 *target++=t1; 879 *target++=t2; 880 count-=3; 881 continue; 882 } 883 } 884 885 /* handle "complicated" and error cases, and continuing partial characters */ 886 oldToULength=0; 887 toULength=1; 888 toULimit=utf8_countTrailBytes[b]+1; 889 c=b; 890 moreBytes: 891 while(toULength<toULimit) { 892 if(source<sourceLimit) { 893 b=*source; 894 if(U8_IS_TRAIL(b)) { 895 ++source; 896 ++toULength; 897 c=(c<<6)+b; 898 } else { 899 break; /* sequence too short, stop with toULength<toULimit */ 900 } 901 } else { 902 /* store the partial UTF-8 character, compatible with the regular UTF-8 converter */ 903 source-=(toULength-oldToULength); 904 while(oldToULength<toULength) { 905 utf8->toUBytes[oldToULength++]=*source++; 906 } 907 utf8->toUnicodeStatus=c; 908 utf8->toULength=toULength; 909 utf8->mode=toULimit; 910 pToUArgs->source=(char *)source; 911 pFromUArgs->target=(char *)target; 912 return; 913 } 914 } 915 916 if( toULength==toULimit && /* consumed all trail bytes */ 917 (toULength==3 || toULength==2) && /* BMP */ 918 (c-=utf8_offsets[toULength])>=utf8_minLegal[toULength] && 919 (c<=0xd7ff || 0xe000<=c) /* not a surrogate */ 920 ) { 921 /* legal byte sequence for BMP code point */ 922 } else if( 923 toULength==toULimit && toULength==4 && 924 (0x10000<=(c-=utf8_offsets[4]) && c<=0x10ffff) 925 ) { 926 /* legal byte sequence for supplementary code point */ 927 } else { 928 /* error handling: illegal UTF-8 byte sequence */ 929 source-=(toULength-oldToULength); 930 while(oldToULength<toULength) { 931 utf8->toUBytes[oldToULength++]=*source++; 932 } 933 utf8->toULength=toULength; 934 pToUArgs->source=(char *)source; 935 pFromUArgs->target=(char *)target; 936 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 937 return; 938 } 939 940 /* copy the legal byte sequence to the target */ 941 { 942 int8_t i; 943 944 for(i=0; i<oldToULength; ++i) { 945 *target++=utf8->toUBytes[i]; 946 } 947 source-=(toULength-oldToULength); 948 for(; i<toULength; ++i) { 949 *target++=*source++; 950 } 951 count-=toULength; 952 } 953 } 954 } 955 956 if(U_SUCCESS(*pErrorCode) && source<sourceLimit) { 957 if(target==(const uint8_t *)pFromUArgs->targetLimit) { 958 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 959 } else { 960 b=*source; 961 toULimit=utf8_countTrailBytes[b]+1; 962 if(toULimit>(sourceLimit-source)) { 963 /* collect a truncated byte sequence */ 964 toULength=0; 965 c=b; 966 for(;;) { 967 utf8->toUBytes[toULength++]=b; 968 if(++source==sourceLimit) { 969 /* partial byte sequence at end of source */ 970 utf8->toUnicodeStatus=c; 971 utf8->toULength=toULength; 972 utf8->mode=toULimit; 973 break; 974 } else if(!U8_IS_TRAIL(b=*source)) { 975 /* lead byte in trail byte position */ 976 utf8->toULength=toULength; 977 *pErrorCode=U_ILLEGAL_CHAR_FOUND; 978 break; 979 } 980 c=(c<<6)+b; 981 } 982 } else { 983 /* partial-sequence target overflow: fall back to the pivoting implementation */ 984 *pErrorCode=U_USING_DEFAULT_WARNING; 985 } 986 } 987 } 988 989 /* write back the updated pointers */ 990 pToUArgs->source=(char *)source; 991 pFromUArgs->target=(char *)target; 992 } 993 994 /* UTF-8 converter data ----------------------------------------------------- */ 995 996 static const UConverterImpl _UTF8Impl={ 997 UCNV_UTF8, 998 999 NULL, 1000 NULL, 1001 1002 NULL, 1003 NULL, 1004 NULL, 1005 1006 ucnv_toUnicode_UTF8, 1007 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1008 ucnv_fromUnicode_UTF8, 1009 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1010 ucnv_getNextUChar_UTF8, 1011 1012 NULL, 1013 NULL, 1014 NULL, 1015 NULL, 1016 ucnv_getNonSurrogateUnicodeSet, 1017 1018 ucnv_UTF8FromUTF8, 1019 ucnv_UTF8FromUTF8 1020 }; 1021 1022 /* The 1208 CCSID refers to any version of Unicode of UTF-8 */ 1023 static const UConverterStaticData _UTF8StaticData={ 1024 sizeof(UConverterStaticData), 1025 "UTF-8", 1026 1208, UCNV_IBM, UCNV_UTF8, 1027 1, 3, /* max 3 bytes per UChar from UTF-8 (4 bytes from surrogate _pair_) */ 1028 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1029 0, 1030 0, 1031 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1032 }; 1033 1034 1035 const UConverterSharedData _UTF8Data={ 1036 sizeof(UConverterSharedData), ~((uint32_t) 0), 1037 NULL, NULL, &_UTF8StaticData, FALSE, &_UTF8Impl, 1038 0 1039 }; 1040 1041 /* CESU-8 converter data ---------------------------------------------------- */ 1042 1043 static const UConverterImpl _CESU8Impl={ 1044 UCNV_CESU8, 1045 1046 NULL, 1047 NULL, 1048 1049 NULL, 1050 NULL, 1051 NULL, 1052 1053 ucnv_toUnicode_UTF8, 1054 ucnv_toUnicode_UTF8_OFFSETS_LOGIC, 1055 ucnv_fromUnicode_UTF8, 1056 ucnv_fromUnicode_UTF8_OFFSETS_LOGIC, 1057 NULL, 1058 1059 NULL, 1060 NULL, 1061 NULL, 1062 NULL, 1063 ucnv_getCompleteUnicodeSet 1064 }; 1065 1066 static const UConverterStaticData _CESU8StaticData={ 1067 sizeof(UConverterStaticData), 1068 "CESU-8", 1069 9400, /* CCSID for CESU-8 */ 1070 UCNV_UNKNOWN, UCNV_CESU8, 1, 3, 1071 { 0xef, 0xbf, 0xbd, 0 },3,FALSE,FALSE, 1072 0, 1073 0, 1074 { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ 1075 }; 1076 1077 1078 const UConverterSharedData _CESU8Data={ 1079 sizeof(UConverterSharedData), ~((uint32_t) 0), 1080 NULL, NULL, &_CESU8StaticData, FALSE, &_CESU8Impl, 1081 0 1082 }; 1083 1084 #endif 1085