1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2001-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * 9 * File ustrtrns.c 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 9/10/2001 Ram Creation. 15 ****************************************************************************** 16 */ 17 18 /******************************************************************************* 19 * 20 * u_strTo* and u_strFrom* APIs 21 * WCS functions moved to ustr_wcs.c for better modularization 22 * 23 ******************************************************************************* 24 */ 25 26 27 #include "unicode/putil.h" 28 #include "unicode/ustring.h" 29 #include "cstring.h" 30 #include "cmemory.h" 31 #include "ustr_imp.h" 32 33 U_CAPI UChar* U_EXPORT2 34 u_strFromUTF32WithSub(UChar *dest, 35 int32_t destCapacity, 36 int32_t *pDestLength, 37 const UChar32 *src, 38 int32_t srcLength, 39 UChar32 subchar, int32_t *pNumSubstitutions, 40 UErrorCode *pErrorCode) { 41 const UChar32 *srcLimit; 42 UChar32 ch; 43 UChar *destLimit; 44 UChar *pDest; 45 int32_t reqLength; 46 int32_t numSubstitutions; 47 48 /* args check */ 49 if(U_FAILURE(*pErrorCode)){ 50 return NULL; 51 } 52 if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) || 53 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 54 ) { 55 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 56 return NULL; 57 } 58 59 if(pNumSubstitutions != NULL) { 60 *pNumSubstitutions = 0; 61 } 62 63 pDest = dest; 64 destLimit = dest + destCapacity; 65 reqLength = 0; 66 numSubstitutions = 0; 67 68 if(srcLength < 0) { 69 /* simple loop for conversion of a NUL-terminated BMP string */ 70 while((ch=*src) != 0 && 71 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 72 ++src; 73 if(pDest < destLimit) { 74 *pDest++ = (UChar)ch; 75 } else { 76 ++reqLength; 77 } 78 } 79 srcLimit = src; 80 if(ch != 0) { 81 /* "complicated" case, find the end of the remaining string */ 82 while(*++srcLimit != 0) {} 83 } 84 } else { 85 srcLimit = src + srcLength; 86 } 87 88 /* convert with length */ 89 while(src < srcLimit) { 90 ch = *src++; 91 do { 92 /* usually "loops" once; twice only for writing subchar */ 93 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 94 if(pDest < destLimit) { 95 *pDest++ = (UChar)ch; 96 } else { 97 ++reqLength; 98 } 99 break; 100 } else if(0x10000 <= ch && ch <= 0x10ffff) { 101 if((pDest + 2) <= destLimit) { 102 *pDest++ = U16_LEAD(ch); 103 *pDest++ = U16_TRAIL(ch); 104 } else { 105 reqLength += 2; 106 } 107 break; 108 } else if((ch = subchar) < 0) { 109 /* surrogate code point, or not a Unicode code point at all */ 110 *pErrorCode = U_INVALID_CHAR_FOUND; 111 return NULL; 112 } else { 113 ++numSubstitutions; 114 } 115 } while(TRUE); 116 } 117 118 reqLength += (int32_t)(pDest - dest); 119 if(pDestLength) { 120 *pDestLength = reqLength; 121 } 122 if(pNumSubstitutions != NULL) { 123 *pNumSubstitutions = numSubstitutions; 124 } 125 126 /* Terminate the buffer */ 127 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 128 129 return dest; 130 } 131 132 U_CAPI UChar* U_EXPORT2 133 u_strFromUTF32(UChar *dest, 134 int32_t destCapacity, 135 int32_t *pDestLength, 136 const UChar32 *src, 137 int32_t srcLength, 138 UErrorCode *pErrorCode) { 139 return u_strFromUTF32WithSub( 140 dest, destCapacity, pDestLength, 141 src, srcLength, 142 U_SENTINEL, NULL, 143 pErrorCode); 144 } 145 146 U_CAPI UChar32* U_EXPORT2 147 u_strToUTF32WithSub(UChar32 *dest, 148 int32_t destCapacity, 149 int32_t *pDestLength, 150 const UChar *src, 151 int32_t srcLength, 152 UChar32 subchar, int32_t *pNumSubstitutions, 153 UErrorCode *pErrorCode) { 154 const UChar *srcLimit; 155 UChar32 ch; 156 UChar ch2; 157 UChar32 *destLimit; 158 UChar32 *pDest; 159 int32_t reqLength; 160 int32_t numSubstitutions; 161 162 /* args check */ 163 if(U_FAILURE(*pErrorCode)){ 164 return NULL; 165 } 166 if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) || 167 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 168 ) { 169 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 170 return NULL; 171 } 172 173 if(pNumSubstitutions != NULL) { 174 *pNumSubstitutions = 0; 175 } 176 177 pDest = dest; 178 destLimit = dest + destCapacity; 179 reqLength = 0; 180 numSubstitutions = 0; 181 182 if(srcLength < 0) { 183 /* simple loop for conversion of a NUL-terminated BMP string */ 184 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 185 ++src; 186 if(pDest < destLimit) { 187 *pDest++ = ch; 188 } else { 189 ++reqLength; 190 } 191 } 192 srcLimit = src; 193 if(ch != 0) { 194 /* "complicated" case, find the end of the remaining string */ 195 while(*++srcLimit != 0) {} 196 } 197 } else { 198 srcLimit = src + srcLength; 199 } 200 201 /* convert with length */ 202 while(src < srcLimit) { 203 ch = *src++; 204 if(!U16_IS_SURROGATE(ch)) { 205 /* write or count ch below */ 206 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 207 ++src; 208 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 209 } else if((ch = subchar) < 0) { 210 /* unpaired surrogate */ 211 *pErrorCode = U_INVALID_CHAR_FOUND; 212 return NULL; 213 } else { 214 ++numSubstitutions; 215 } 216 if(pDest < destLimit) { 217 *pDest++ = ch; 218 } else { 219 ++reqLength; 220 } 221 } 222 223 reqLength += (int32_t)(pDest - dest); 224 if(pDestLength) { 225 *pDestLength = reqLength; 226 } 227 if(pNumSubstitutions != NULL) { 228 *pNumSubstitutions = numSubstitutions; 229 } 230 231 /* Terminate the buffer */ 232 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 233 234 return dest; 235 } 236 237 U_CAPI UChar32* U_EXPORT2 238 u_strToUTF32(UChar32 *dest, 239 int32_t destCapacity, 240 int32_t *pDestLength, 241 const UChar *src, 242 int32_t srcLength, 243 UErrorCode *pErrorCode) { 244 return u_strToUTF32WithSub( 245 dest, destCapacity, pDestLength, 246 src, srcLength, 247 U_SENTINEL, NULL, 248 pErrorCode); 249 } 250 251 /* for utf8_nextCharSafeBodyTerminated() */ 252 static const UChar32 253 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 254 255 /* 256 * Version of utf8_nextCharSafeBody() with the following differences: 257 * - checks for NUL termination instead of length 258 * - works with pointers instead of indexes 259 * - always strict (strict==-1) 260 * 261 * *ps points to after the lead byte and will be moved to after the last trail byte. 262 * c is the lead byte. 263 * @return the code point, or U_SENTINEL 264 */ 265 static UChar32 266 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 267 const uint8_t *s=*ps; 268 uint8_t trail, illegal=0; 269 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); 270 UTF8_MASK_LEAD_BYTE((c), count); 271 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 272 switch(count) { 273 /* each branch falls through to the next one */ 274 case 5: 275 case 4: 276 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 277 illegal=1; 278 break; 279 case 3: 280 trail=(uint8_t)(*s++ - 0x80); 281 c=(c<<6)|trail; 282 if(trail>0x3f || c>=0x110) { 283 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 284 illegal=1; 285 break; 286 } 287 case 2: 288 trail=(uint8_t)(*s++ - 0x80); 289 if(trail>0x3f) { 290 /* not a trail byte */ 291 illegal=1; 292 break; 293 } 294 c=(c<<6)|trail; 295 case 1: 296 trail=(uint8_t)(*s++ - 0x80); 297 if(trail>0x3f) { 298 /* not a trail byte */ 299 illegal=1; 300 } 301 c=(c<<6)|trail; 302 break; 303 case 0: 304 return U_SENTINEL; 305 /* no default branch to optimize switch() - all values are covered */ 306 } 307 308 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 309 /* illegal is also set if count>=4 */ 310 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { 311 /* error handling */ 312 /* don't go beyond this sequence */ 313 s=*ps; 314 while(count>0 && UTF8_IS_TRAIL(*s)) { 315 ++s; 316 --count; 317 } 318 c=U_SENTINEL; 319 } 320 *ps=s; 321 return c; 322 } 323 324 /* 325 * Version of utf8_nextCharSafeBody() with the following differences: 326 * - works with pointers instead of indexes 327 * - always strict (strict==-1) 328 * 329 * *ps points to after the lead byte and will be moved to after the last trail byte. 330 * c is the lead byte. 331 * @return the code point, or U_SENTINEL 332 */ 333 static UChar32 334 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 335 const uint8_t *s=*ps; 336 uint8_t trail, illegal=0; 337 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); 338 if((limit-s)>=count) { 339 UTF8_MASK_LEAD_BYTE((c), count); 340 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 341 switch(count) { 342 /* each branch falls through to the next one */ 343 case 5: 344 case 4: 345 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 346 illegal=1; 347 break; 348 case 3: 349 trail=*s++; 350 c=(c<<6)|(trail&0x3f); 351 if(c<0x110) { 352 illegal|=(trail&0xc0)^0x80; 353 } else { 354 /* code point>0x10ffff, outside Unicode */ 355 illegal=1; 356 break; 357 } 358 case 2: 359 trail=*s++; 360 c=(c<<6)|(trail&0x3f); 361 illegal|=(trail&0xc0)^0x80; 362 case 1: 363 trail=*s++; 364 c=(c<<6)|(trail&0x3f); 365 illegal|=(trail&0xc0)^0x80; 366 break; 367 case 0: 368 return U_SENTINEL; 369 /* no default branch to optimize switch() - all values are covered */ 370 } 371 } else { 372 illegal=1; /* too few bytes left */ 373 } 374 375 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 376 /* illegal is also set if count>=4 */ 377 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { 378 /* error handling */ 379 /* don't go beyond this sequence */ 380 s=*ps; 381 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) { 382 ++s; 383 --count; 384 } 385 c=U_SENTINEL; 386 } 387 *ps=s; 388 return c; 389 } 390 391 U_CAPI UChar* U_EXPORT2 392 u_strFromUTF8WithSub(UChar *dest, 393 int32_t destCapacity, 394 int32_t *pDestLength, 395 const char* src, 396 int32_t srcLength, 397 UChar32 subchar, int32_t *pNumSubstitutions, 398 UErrorCode *pErrorCode){ 399 400 UChar *pDest = dest; 401 UChar *pDestLimit = dest+destCapacity; 402 UChar32 ch; 403 int32_t reqLength = 0; 404 const uint8_t* pSrc = (const uint8_t*) src; 405 uint8_t t1, t2; /* trail bytes */ 406 int32_t numSubstitutions; 407 408 /* args check */ 409 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 410 return NULL; 411 } 412 413 if( (src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) || 414 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 415 ) { 416 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 417 return NULL; 418 } 419 420 if(pNumSubstitutions!=NULL) { 421 *pNumSubstitutions=0; 422 } 423 numSubstitutions=0; 424 425 /* 426 * Inline processing of UTF-8 byte sequences: 427 * 428 * Byte sequences for the most common characters are handled inline in 429 * the conversion loops. In order to reduce the path lengths for those 430 * characters, the tests are arranged in a kind of binary search. 431 * ASCII (<=0x7f) is checked first, followed by the dividing point 432 * between 2- and 3-byte sequences (0xe0). 433 * The 3-byte branch is tested first to speed up CJK text. 434 * The compiler should combine the subtractions for the two tests for 0xe0. 435 * Each branch then tests for the other end of its range. 436 */ 437 438 if(srcLength < 0){ 439 /* 440 * Transform a NUL-terminated string. 441 * The code explicitly checks for NULs only in the lead byte position. 442 * A NUL byte in the trail byte position fails the trail byte range check anyway. 443 */ 444 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 445 if(ch <= 0x7f){ 446 *pDest++=(UChar)ch; 447 ++pSrc; 448 } else { 449 if(ch > 0xe0) { 450 if( /* handle U+1000..U+CFFF inline */ 451 ch <= 0xec && 452 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 453 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 454 ) { 455 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 456 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 457 pSrc += 3; 458 continue; 459 } 460 } else if(ch < 0xe0) { 461 if( /* handle U+0080..U+07FF inline */ 462 ch >= 0xc2 && 463 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 464 ) { 465 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 466 pSrc += 2; 467 continue; 468 } 469 } 470 471 /* function call for "complicated" and error cases */ 472 ++pSrc; /* continue after the lead byte */ 473 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 474 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 475 *pErrorCode = U_INVALID_CHAR_FOUND; 476 return NULL; 477 } else if(ch<=0xFFFF) { 478 *(pDest++)=(UChar)ch; 479 } else { 480 *(pDest++)=UTF16_LEAD(ch); 481 if(pDest<pDestLimit) { 482 *(pDest++)=UTF16_TRAIL(ch); 483 } else { 484 reqLength++; 485 break; 486 } 487 } 488 } 489 } 490 491 /* Pre-flight the rest of the string. */ 492 while((ch = *pSrc) != 0) { 493 if(ch <= 0x7f){ 494 ++reqLength; 495 ++pSrc; 496 } else { 497 if(ch > 0xe0) { 498 if( /* handle U+1000..U+CFFF inline */ 499 ch <= 0xec && 500 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 501 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 502 ) { 503 ++reqLength; 504 pSrc += 3; 505 continue; 506 } 507 } else if(ch < 0xe0) { 508 if( /* handle U+0080..U+07FF inline */ 509 ch >= 0xc2 && 510 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 511 ) { 512 ++reqLength; 513 pSrc += 2; 514 continue; 515 } 516 } 517 518 /* function call for "complicated" and error cases */ 519 ++pSrc; /* continue after the lead byte */ 520 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 521 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 522 *pErrorCode = U_INVALID_CHAR_FOUND; 523 return NULL; 524 } 525 reqLength += U16_LENGTH(ch); 526 } 527 } 528 } else /* srcLength >= 0 */ { 529 const uint8_t *pSrcLimit = pSrc + srcLength; 530 int32_t count; 531 532 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 533 for(;;) { 534 /* 535 * Each iteration of the inner loop progresses by at most 3 UTF-8 536 * bytes and one UChar, for most characters. 537 * For supplementary code points (4 & 2), which are rare, 538 * there is an additional adjustment. 539 */ 540 count = (int32_t)(pDestLimit - pDest); 541 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 542 if(count > srcLength) { 543 count = srcLength; /* min(remaining dest, remaining src/3) */ 544 } 545 if(count < 3) { 546 /* 547 * Too much overhead if we get near the end of the string, 548 * continue with the next loop. 549 */ 550 break; 551 } 552 553 do { 554 ch = *pSrc; 555 if(ch <= 0x7f){ 556 *pDest++=(UChar)ch; 557 ++pSrc; 558 } else { 559 if(ch > 0xe0) { 560 if( /* handle U+1000..U+CFFF inline */ 561 ch <= 0xec && 562 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 563 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 564 ) { 565 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 566 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 567 pSrc += 3; 568 continue; 569 } 570 } else if(ch < 0xe0) { 571 if( /* handle U+0080..U+07FF inline */ 572 ch >= 0xc2 && 573 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 574 ) { 575 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 576 pSrc += 2; 577 continue; 578 } 579 } 580 581 if(ch >= 0xf0 || subchar > 0xffff) { 582 /* 583 * We may read up to six bytes and write up to two UChars, 584 * which we didn't account for with computing count, 585 * so we adjust it here. 586 */ 587 if(--count == 0) { 588 break; 589 } 590 } 591 592 /* function call for "complicated" and error cases */ 593 ++pSrc; /* continue after the lead byte */ 594 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 595 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 596 *pErrorCode = U_INVALID_CHAR_FOUND; 597 return NULL; 598 }else if(ch<=0xFFFF){ 599 *(pDest++)=(UChar)ch; 600 }else{ 601 *(pDest++)=UTF16_LEAD(ch); 602 if(pDest<pDestLimit){ 603 *(pDest++)=UTF16_TRAIL(ch); 604 }else{ 605 reqLength++; 606 break; 607 } 608 } 609 } 610 } while(--count > 0); 611 } 612 613 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 614 ch = *pSrc; 615 if(ch <= 0x7f){ 616 *pDest++=(UChar)ch; 617 ++pSrc; 618 } else { 619 if(ch > 0xe0) { 620 if( /* handle U+1000..U+CFFF inline */ 621 ch <= 0xec && 622 ((pSrcLimit - pSrc) >= 3) && 623 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 624 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 625 ) { 626 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 627 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 628 pSrc += 3; 629 continue; 630 } 631 } else if(ch < 0xe0) { 632 if( /* handle U+0080..U+07FF inline */ 633 ch >= 0xc2 && 634 ((pSrcLimit - pSrc) >= 2) && 635 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 636 ) { 637 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 638 pSrc += 2; 639 continue; 640 } 641 } 642 643 /* function call for "complicated" and error cases */ 644 ++pSrc; /* continue after the lead byte */ 645 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 646 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 647 *pErrorCode = U_INVALID_CHAR_FOUND; 648 return NULL; 649 }else if(ch<=0xFFFF){ 650 *(pDest++)=(UChar)ch; 651 }else{ 652 *(pDest++)=UTF16_LEAD(ch); 653 if(pDest<pDestLimit){ 654 *(pDest++)=UTF16_TRAIL(ch); 655 }else{ 656 reqLength++; 657 break; 658 } 659 } 660 } 661 } 662 /* donot fill the dest buffer just count the UChars needed */ 663 while(pSrc < pSrcLimit){ 664 ch = *pSrc; 665 if(ch <= 0x7f){ 666 reqLength++; 667 ++pSrc; 668 } else { 669 if(ch > 0xe0) { 670 if( /* handle U+1000..U+CFFF inline */ 671 ch <= 0xec && 672 ((pSrcLimit - pSrc) >= 3) && 673 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 674 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 675 ) { 676 reqLength++; 677 pSrc += 3; 678 continue; 679 } 680 } else if(ch < 0xe0) { 681 if( /* handle U+0080..U+07FF inline */ 682 ch >= 0xc2 && 683 ((pSrcLimit - pSrc) >= 2) && 684 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 685 ) { 686 reqLength++; 687 pSrc += 2; 688 continue; 689 } 690 } 691 692 /* function call for "complicated" and error cases */ 693 ++pSrc; /* continue after the lead byte */ 694 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 695 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 696 *pErrorCode = U_INVALID_CHAR_FOUND; 697 return NULL; 698 } 699 reqLength+=UTF_CHAR_LENGTH(ch); 700 } 701 } 702 } 703 704 reqLength+=(int32_t)(pDest - dest); 705 706 if(pNumSubstitutions!=NULL) { 707 *pNumSubstitutions=numSubstitutions; 708 } 709 710 if(pDestLength){ 711 *pDestLength = reqLength; 712 } 713 714 /* Terminate the buffer */ 715 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 716 717 return dest; 718 } 719 720 U_CAPI UChar* U_EXPORT2 721 u_strFromUTF8(UChar *dest, 722 int32_t destCapacity, 723 int32_t *pDestLength, 724 const char* src, 725 int32_t srcLength, 726 UErrorCode *pErrorCode){ 727 return u_strFromUTF8WithSub( 728 dest, destCapacity, pDestLength, 729 src, srcLength, 730 U_SENTINEL, NULL, 731 pErrorCode); 732 } 733 734 U_CAPI UChar * U_EXPORT2 735 u_strFromUTF8Lenient(UChar *dest, 736 int32_t destCapacity, 737 int32_t *pDestLength, 738 const char *src, 739 int32_t srcLength, 740 UErrorCode *pErrorCode) { 741 742 UChar *pDest = dest; 743 UChar32 ch; 744 int32_t reqLength = 0; 745 uint8_t* pSrc = (uint8_t*) src; 746 747 /* args check */ 748 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 749 return NULL; 750 } 751 752 if((src==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0)) { 753 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 754 return NULL; 755 } 756 757 if(srcLength < 0) { 758 /* Transform a NUL-terminated string. */ 759 UChar *pDestLimit = dest+destCapacity; 760 uint8_t t1, t2, t3; /* trail bytes */ 761 762 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 763 if(ch < 0xc0) { 764 /* 765 * ASCII, or a trail byte in lead position which is treated like 766 * a single-byte sequence for better character boundary 767 * resynchronization after illegal sequences. 768 */ 769 *pDest++=(UChar)ch; 770 ++pSrc; 771 continue; 772 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 773 if((t1 = pSrc[1]) != 0) { 774 /* 0x3080 = (0xc0 << 6) + 0x80 */ 775 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 776 pSrc += 2; 777 continue; 778 } 779 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 780 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 781 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 782 /* 0x2080 = (0x80 << 6) + 0x80 */ 783 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 784 pSrc += 3; 785 continue; 786 } 787 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 788 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 789 pSrc += 4; 790 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 791 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 792 *(pDest++) = U16_LEAD(ch); 793 if(pDest < pDestLimit) { 794 *(pDest++) = U16_TRAIL(ch); 795 } else { 796 reqLength = 1; 797 break; 798 } 799 continue; 800 } 801 } 802 803 /* truncated character at the end */ 804 *pDest++ = 0xfffd; 805 while(*++pSrc != 0) {} 806 break; 807 } 808 809 /* Pre-flight the rest of the string. */ 810 while((ch = *pSrc) != 0) { 811 if(ch < 0xc0) { 812 /* 813 * ASCII, or a trail byte in lead position which is treated like 814 * a single-byte sequence for better character boundary 815 * resynchronization after illegal sequences. 816 */ 817 ++reqLength; 818 ++pSrc; 819 continue; 820 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 821 if(pSrc[1] != 0) { 822 ++reqLength; 823 pSrc += 2; 824 continue; 825 } 826 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 827 if(pSrc[1] != 0 && pSrc[2] != 0) { 828 ++reqLength; 829 pSrc += 3; 830 continue; 831 } 832 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 833 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 834 reqLength += 2; 835 pSrc += 4; 836 continue; 837 } 838 } 839 840 /* truncated character at the end */ 841 ++reqLength; 842 break; 843 } 844 } else /* srcLength >= 0 */ { 845 const uint8_t *pSrcLimit = pSrc + srcLength; 846 847 /* 848 * This function requires that if srcLength is given, then it must be 849 * destCapatity >= srcLength so that we need not check for 850 * destination buffer overflow in the loop. 851 */ 852 if(destCapacity < srcLength) { 853 if(pDestLength != NULL) { 854 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 855 } 856 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 857 return NULL; 858 } 859 860 if((pSrcLimit - pSrc) >= 4) { 861 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 862 863 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 864 do { 865 ch = *pSrc++; 866 if(ch < 0xc0) { 867 /* 868 * ASCII, or a trail byte in lead position which is treated like 869 * a single-byte sequence for better character boundary 870 * resynchronization after illegal sequences. 871 */ 872 *pDest++=(UChar)ch; 873 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 874 /* 0x3080 = (0xc0 << 6) + 0x80 */ 875 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 876 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 877 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 878 /* 0x2080 = (0x80 << 6) + 0x80 */ 879 ch = (ch << 12) + (*pSrc++ << 6); 880 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 881 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 882 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 883 ch = (ch << 18) + (*pSrc++ << 12); 884 ch += *pSrc++ << 6; 885 ch += *pSrc++ - 0x3c82080; 886 *(pDest++) = U16_LEAD(ch); 887 *(pDest++) = U16_TRAIL(ch); 888 } 889 } while(pSrc < pSrcLimit); 890 891 pSrcLimit += 3; /* restore original pSrcLimit */ 892 } 893 894 while(pSrc < pSrcLimit) { 895 ch = *pSrc++; 896 if(ch < 0xc0) { 897 /* 898 * ASCII, or a trail byte in lead position which is treated like 899 * a single-byte sequence for better character boundary 900 * resynchronization after illegal sequences. 901 */ 902 *pDest++=(UChar)ch; 903 continue; 904 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 905 if(pSrc < pSrcLimit) { 906 /* 0x3080 = (0xc0 << 6) + 0x80 */ 907 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 908 continue; 909 } 910 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 911 if((pSrcLimit - pSrc) >= 2) { 912 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 913 /* 0x2080 = (0x80 << 6) + 0x80 */ 914 ch = (ch << 12) + (*pSrc++ << 6); 915 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 916 pSrc += 3; 917 continue; 918 } 919 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 920 if((pSrcLimit - pSrc) >= 3) { 921 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 922 ch = (ch << 18) + (*pSrc++ << 12); 923 ch += *pSrc++ << 6; 924 ch += *pSrc++ - 0x3c82080; 925 *(pDest++) = U16_LEAD(ch); 926 *(pDest++) = U16_TRAIL(ch); 927 pSrc += 4; 928 continue; 929 } 930 } 931 932 /* truncated character at the end */ 933 *pDest++ = 0xfffd; 934 break; 935 } 936 } 937 938 reqLength+=(int32_t)(pDest - dest); 939 940 if(pDestLength){ 941 *pDestLength = reqLength; 942 } 943 944 /* Terminate the buffer */ 945 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 946 947 return dest; 948 } 949 950 static U_INLINE uint8_t * 951 _appendUTF8(uint8_t *pDest, UChar32 c) { 952 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 953 if((c)<=0x7f) { 954 *pDest++=(uint8_t)c; 955 } else if(c<=0x7ff) { 956 *pDest++=(uint8_t)((c>>6)|0xc0); 957 *pDest++=(uint8_t)((c&0x3f)|0x80); 958 } else if(c<=0xffff) { 959 *pDest++=(uint8_t)((c>>12)|0xe0); 960 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 961 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 962 } else /* if((uint32_t)(c)<=0x10ffff) */ { 963 *pDest++=(uint8_t)(((c)>>18)|0xf0); 964 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 965 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 966 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 967 } 968 return pDest; 969 } 970 971 972 U_CAPI char* U_EXPORT2 973 u_strToUTF8WithSub(char *dest, 974 int32_t destCapacity, 975 int32_t *pDestLength, 976 const UChar *pSrc, 977 int32_t srcLength, 978 UChar32 subchar, int32_t *pNumSubstitutions, 979 UErrorCode *pErrorCode){ 980 981 int32_t reqLength=0; 982 uint32_t ch=0,ch2=0; 983 uint8_t *pDest = (uint8_t *)dest; 984 uint8_t *pDestLimit = pDest + destCapacity; 985 int32_t numSubstitutions; 986 987 /* args check */ 988 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 989 return NULL; 990 } 991 992 if( (pSrc==NULL) || (srcLength < -1) || (destCapacity<0) || (!dest && destCapacity > 0) || 993 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 994 ) { 995 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 996 return NULL; 997 } 998 999 if(pNumSubstitutions!=NULL) { 1000 *pNumSubstitutions=0; 1001 } 1002 numSubstitutions=0; 1003 1004 if(srcLength==-1) { 1005 while((ch=*pSrc)!=0) { 1006 ++pSrc; 1007 if(ch <= 0x7f) { 1008 if(pDest<pDestLimit) { 1009 *pDest++ = (char)ch; 1010 } else { 1011 reqLength = 1; 1012 break; 1013 } 1014 } else if(ch <= 0x7ff) { 1015 if((pDestLimit - pDest) >= 2) { 1016 *pDest++=(uint8_t)((ch>>6)|0xc0); 1017 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1018 } else { 1019 reqLength = 2; 1020 break; 1021 } 1022 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1023 if((pDestLimit - pDest) >= 3) { 1024 *pDest++=(uint8_t)((ch>>12)|0xe0); 1025 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1026 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1027 } else { 1028 reqLength = 3; 1029 break; 1030 } 1031 } else /* ch is a surrogate */ { 1032 int32_t length; 1033 1034 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/ 1035 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1036 ++pSrc; 1037 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1038 } else if(subchar>=0) { 1039 ch=subchar; 1040 ++numSubstitutions; 1041 } else { 1042 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1043 *pErrorCode = U_INVALID_CHAR_FOUND; 1044 return NULL; 1045 } 1046 1047 length = U8_LENGTH(ch); 1048 if((pDestLimit - pDest) >= length) { 1049 /* convert and append*/ 1050 pDest=_appendUTF8(pDest, ch); 1051 } else { 1052 reqLength = length; 1053 break; 1054 } 1055 } 1056 } 1057 while((ch=*pSrc++)!=0) { 1058 if(ch<=0x7f) { 1059 ++reqLength; 1060 } else if(ch<=0x7ff) { 1061 reqLength+=2; 1062 } else if(!UTF_IS_SURROGATE(ch)) { 1063 reqLength+=3; 1064 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1065 ++pSrc; 1066 reqLength+=4; 1067 } else if(subchar>=0) { 1068 reqLength+=U8_LENGTH(subchar); 1069 ++numSubstitutions; 1070 } else { 1071 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1072 *pErrorCode = U_INVALID_CHAR_FOUND; 1073 return NULL; 1074 } 1075 } 1076 } else { 1077 const UChar *pSrcLimit = pSrc+srcLength; 1078 int32_t count; 1079 1080 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1081 for(;;) { 1082 /* 1083 * Each iteration of the inner loop progresses by at most 3 UTF-8 1084 * bytes and one UChar, for most characters. 1085 * For supplementary code points (4 & 2), which are rare, 1086 * there is an additional adjustment. 1087 */ 1088 count = (int32_t)((pDestLimit - pDest) / 3); 1089 srcLength = (int32_t)(pSrcLimit - pSrc); 1090 if(count > srcLength) { 1091 count = srcLength; /* min(remaining dest/3, remaining src) */ 1092 } 1093 if(count < 3) { 1094 /* 1095 * Too much overhead if we get near the end of the string, 1096 * continue with the next loop. 1097 */ 1098 break; 1099 } 1100 do { 1101 ch=*pSrc++; 1102 if(ch <= 0x7f) { 1103 *pDest++ = (char)ch; 1104 } else if(ch <= 0x7ff) { 1105 *pDest++=(uint8_t)((ch>>6)|0xc0); 1106 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1107 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1108 *pDest++=(uint8_t)((ch>>12)|0xe0); 1109 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1110 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1111 } else /* ch is a surrogate */ { 1112 /* 1113 * We will read two UChars and probably output four bytes, 1114 * which we didn't account for with computing count, 1115 * so we adjust it here. 1116 */ 1117 if(--count == 0) { 1118 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1119 break; /* recompute count */ 1120 } 1121 1122 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1123 ++pSrc; 1124 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1125 1126 /* writing 4 bytes per 2 UChars is ok */ 1127 *pDest++=(uint8_t)((ch>>18)|0xf0); 1128 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1129 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1130 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1131 } else { 1132 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1133 if(subchar>=0) { 1134 ch=subchar; 1135 ++numSubstitutions; 1136 } else { 1137 *pErrorCode = U_INVALID_CHAR_FOUND; 1138 return NULL; 1139 } 1140 1141 /* convert and append*/ 1142 pDest=_appendUTF8(pDest, ch); 1143 } 1144 } 1145 } while(--count > 0); 1146 } 1147 1148 while(pSrc<pSrcLimit) { 1149 ch=*pSrc++; 1150 if(ch <= 0x7f) { 1151 if(pDest<pDestLimit) { 1152 *pDest++ = (char)ch; 1153 } else { 1154 reqLength = 1; 1155 break; 1156 } 1157 } else if(ch <= 0x7ff) { 1158 if((pDestLimit - pDest) >= 2) { 1159 *pDest++=(uint8_t)((ch>>6)|0xc0); 1160 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1161 } else { 1162 reqLength = 2; 1163 break; 1164 } 1165 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1166 if((pDestLimit - pDest) >= 3) { 1167 *pDest++=(uint8_t)((ch>>12)|0xe0); 1168 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1169 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1170 } else { 1171 reqLength = 3; 1172 break; 1173 } 1174 } else /* ch is a surrogate */ { 1175 int32_t length; 1176 1177 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { 1178 ++pSrc; 1179 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1180 } else if(subchar>=0) { 1181 ch=subchar; 1182 ++numSubstitutions; 1183 } else { 1184 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1185 *pErrorCode = U_INVALID_CHAR_FOUND; 1186 return NULL; 1187 } 1188 1189 length = U8_LENGTH(ch); 1190 if((pDestLimit - pDest) >= length) { 1191 /* convert and append*/ 1192 pDest=_appendUTF8(pDest, ch); 1193 } else { 1194 reqLength = length; 1195 break; 1196 } 1197 } 1198 } 1199 while(pSrc<pSrcLimit) { 1200 ch=*pSrc++; 1201 if(ch<=0x7f) { 1202 ++reqLength; 1203 } else if(ch<=0x7ff) { 1204 reqLength+=2; 1205 } else if(!UTF_IS_SURROGATE(ch)) { 1206 reqLength+=3; 1207 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { 1208 ++pSrc; 1209 reqLength+=4; 1210 } else if(subchar>=0) { 1211 reqLength+=U8_LENGTH(subchar); 1212 ++numSubstitutions; 1213 } else { 1214 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1215 *pErrorCode = U_INVALID_CHAR_FOUND; 1216 return NULL; 1217 } 1218 } 1219 } 1220 1221 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1222 1223 if(pNumSubstitutions!=NULL) { 1224 *pNumSubstitutions=numSubstitutions; 1225 } 1226 1227 if(pDestLength){ 1228 *pDestLength = reqLength; 1229 } 1230 1231 /* Terminate the buffer */ 1232 u_terminateChars((char*)dest,destCapacity,reqLength,pErrorCode); 1233 1234 return (char*)dest; 1235 } 1236 1237 U_CAPI char* U_EXPORT2 1238 u_strToUTF8(char *dest, 1239 int32_t destCapacity, 1240 int32_t *pDestLength, 1241 const UChar *pSrc, 1242 int32_t srcLength, 1243 UErrorCode *pErrorCode){ 1244 return u_strToUTF8WithSub( 1245 dest, destCapacity, pDestLength, 1246 pSrc, srcLength, 1247 U_SENTINEL, NULL, 1248 pErrorCode); 1249 } 1250