1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2001-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * 9 * File ustrtrns.c 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 9/10/2001 Ram Creation. 15 ****************************************************************************** 16 */ 17 18 /******************************************************************************* 19 * 20 * u_strTo* and u_strFrom* APIs 21 * WCS functions moved to ustr_wcs.c for better modularization 22 * 23 ******************************************************************************* 24 */ 25 26 27 #include "unicode/putil.h" 28 #include "unicode/ustring.h" 29 #include "cstring.h" 30 #include "cmemory.h" 31 #include "ustr_imp.h" 32 33 U_CAPI UChar* U_EXPORT2 34 u_strFromUTF32WithSub(UChar *dest, 35 int32_t destCapacity, 36 int32_t *pDestLength, 37 const UChar32 *src, 38 int32_t srcLength, 39 UChar32 subchar, int32_t *pNumSubstitutions, 40 UErrorCode *pErrorCode) { 41 const UChar32 *srcLimit; 42 UChar32 ch; 43 UChar *destLimit; 44 UChar *pDest; 45 int32_t reqLength; 46 int32_t numSubstitutions; 47 48 /* args check */ 49 if(U_FAILURE(*pErrorCode)){ 50 return NULL; 51 } 52 if( (src==NULL && srcLength!=0) || srcLength < -1 || 53 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 54 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 55 ) { 56 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 57 return NULL; 58 } 59 60 if(pNumSubstitutions != NULL) { 61 *pNumSubstitutions = 0; 62 } 63 64 pDest = dest; 65 destLimit = dest + destCapacity; 66 reqLength = 0; 67 numSubstitutions = 0; 68 69 if(srcLength < 0) { 70 /* simple loop for conversion of a NUL-terminated BMP string */ 71 while((ch=*src) != 0 && 72 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 73 ++src; 74 if(pDest < destLimit) { 75 *pDest++ = (UChar)ch; 76 } else { 77 ++reqLength; 78 } 79 } 80 srcLimit = src; 81 if(ch != 0) { 82 /* "complicated" case, find the end of the remaining string */ 83 while(*++srcLimit != 0) {} 84 } 85 } else { 86 srcLimit = src + srcLength; 87 } 88 89 /* convert with length */ 90 while(src < srcLimit) { 91 ch = *src++; 92 do { 93 /* usually "loops" once; twice only for writing subchar */ 94 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 95 if(pDest < destLimit) { 96 *pDest++ = (UChar)ch; 97 } else { 98 ++reqLength; 99 } 100 break; 101 } else if(0x10000 <= ch && ch <= 0x10ffff) { 102 if((pDest + 2) <= destLimit) { 103 *pDest++ = U16_LEAD(ch); 104 *pDest++ = U16_TRAIL(ch); 105 } else { 106 reqLength += 2; 107 } 108 break; 109 } else if((ch = subchar) < 0) { 110 /* surrogate code point, or not a Unicode code point at all */ 111 *pErrorCode = U_INVALID_CHAR_FOUND; 112 return NULL; 113 } else { 114 ++numSubstitutions; 115 } 116 } while(TRUE); 117 } 118 119 reqLength += (int32_t)(pDest - dest); 120 if(pDestLength) { 121 *pDestLength = reqLength; 122 } 123 if(pNumSubstitutions != NULL) { 124 *pNumSubstitutions = numSubstitutions; 125 } 126 127 /* Terminate the buffer */ 128 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 129 130 return dest; 131 } 132 133 U_CAPI UChar* U_EXPORT2 134 u_strFromUTF32(UChar *dest, 135 int32_t destCapacity, 136 int32_t *pDestLength, 137 const UChar32 *src, 138 int32_t srcLength, 139 UErrorCode *pErrorCode) { 140 return u_strFromUTF32WithSub( 141 dest, destCapacity, pDestLength, 142 src, srcLength, 143 U_SENTINEL, NULL, 144 pErrorCode); 145 } 146 147 U_CAPI UChar32* U_EXPORT2 148 u_strToUTF32WithSub(UChar32 *dest, 149 int32_t destCapacity, 150 int32_t *pDestLength, 151 const UChar *src, 152 int32_t srcLength, 153 UChar32 subchar, int32_t *pNumSubstitutions, 154 UErrorCode *pErrorCode) { 155 const UChar *srcLimit; 156 UChar32 ch; 157 UChar ch2; 158 UChar32 *destLimit; 159 UChar32 *pDest; 160 int32_t reqLength; 161 int32_t numSubstitutions; 162 163 /* args check */ 164 if(U_FAILURE(*pErrorCode)){ 165 return NULL; 166 } 167 if( (src==NULL && srcLength!=0) || srcLength < -1 || 168 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 169 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 170 ) { 171 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 172 return NULL; 173 } 174 175 if(pNumSubstitutions != NULL) { 176 *pNumSubstitutions = 0; 177 } 178 179 pDest = dest; 180 destLimit = dest + destCapacity; 181 reqLength = 0; 182 numSubstitutions = 0; 183 184 if(srcLength < 0) { 185 /* simple loop for conversion of a NUL-terminated BMP string */ 186 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 187 ++src; 188 if(pDest < destLimit) { 189 *pDest++ = ch; 190 } else { 191 ++reqLength; 192 } 193 } 194 srcLimit = src; 195 if(ch != 0) { 196 /* "complicated" case, find the end of the remaining string */ 197 while(*++srcLimit != 0) {} 198 } 199 } else { 200 srcLimit = src + srcLength; 201 } 202 203 /* convert with length */ 204 while(src < srcLimit) { 205 ch = *src++; 206 if(!U16_IS_SURROGATE(ch)) { 207 /* write or count ch below */ 208 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 209 ++src; 210 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 211 } else if((ch = subchar) < 0) { 212 /* unpaired surrogate */ 213 *pErrorCode = U_INVALID_CHAR_FOUND; 214 return NULL; 215 } else { 216 ++numSubstitutions; 217 } 218 if(pDest < destLimit) { 219 *pDest++ = ch; 220 } else { 221 ++reqLength; 222 } 223 } 224 225 reqLength += (int32_t)(pDest - dest); 226 if(pDestLength) { 227 *pDestLength = reqLength; 228 } 229 if(pNumSubstitutions != NULL) { 230 *pNumSubstitutions = numSubstitutions; 231 } 232 233 /* Terminate the buffer */ 234 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 235 236 return dest; 237 } 238 239 U_CAPI UChar32* U_EXPORT2 240 u_strToUTF32(UChar32 *dest, 241 int32_t destCapacity, 242 int32_t *pDestLength, 243 const UChar *src, 244 int32_t srcLength, 245 UErrorCode *pErrorCode) { 246 return u_strToUTF32WithSub( 247 dest, destCapacity, pDestLength, 248 src, srcLength, 249 U_SENTINEL, NULL, 250 pErrorCode); 251 } 252 253 /* for utf8_nextCharSafeBodyTerminated() */ 254 static const UChar32 255 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 256 257 /* 258 * Version of utf8_nextCharSafeBody() with the following differences: 259 * - checks for NUL termination instead of length 260 * - works with pointers instead of indexes 261 * - always strict (strict==-1) 262 * 263 * *ps points to after the lead byte and will be moved to after the last trail byte. 264 * c is the lead byte. 265 * @return the code point, or U_SENTINEL 266 */ 267 static UChar32 268 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 269 const uint8_t *s=*ps; 270 uint8_t trail, illegal=0; 271 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); 272 UTF8_MASK_LEAD_BYTE((c), count); 273 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 274 switch(count) { 275 /* each branch falls through to the next one */ 276 case 5: 277 case 4: 278 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 279 illegal=1; 280 break; 281 case 3: 282 trail=(uint8_t)(*s++ - 0x80); 283 c=(c<<6)|trail; 284 if(trail>0x3f || c>=0x110) { 285 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 286 illegal=1; 287 break; 288 } 289 case 2: 290 trail=(uint8_t)(*s++ - 0x80); 291 if(trail>0x3f) { 292 /* not a trail byte */ 293 illegal=1; 294 break; 295 } 296 c=(c<<6)|trail; 297 case 1: 298 trail=(uint8_t)(*s++ - 0x80); 299 if(trail>0x3f) { 300 /* not a trail byte */ 301 illegal=1; 302 } 303 c=(c<<6)|trail; 304 break; 305 case 0: 306 return U_SENTINEL; 307 /* no default branch to optimize switch() - all values are covered */ 308 } 309 310 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 311 /* illegal is also set if count>=4 */ 312 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { 313 /* error handling */ 314 /* don't go beyond this sequence */ 315 s=*ps; 316 while(count>0 && UTF8_IS_TRAIL(*s)) { 317 ++s; 318 --count; 319 } 320 c=U_SENTINEL; 321 } 322 *ps=s; 323 return c; 324 } 325 326 /* 327 * Version of utf8_nextCharSafeBody() with the following differences: 328 * - works with pointers instead of indexes 329 * - always strict (strict==-1) 330 * 331 * *ps points to after the lead byte and will be moved to after the last trail byte. 332 * c is the lead byte. 333 * @return the code point, or U_SENTINEL 334 */ 335 static UChar32 336 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 337 const uint8_t *s=*ps; 338 uint8_t trail, illegal=0; 339 uint8_t count=UTF8_COUNT_TRAIL_BYTES(c); 340 if((limit-s)>=count) { 341 UTF8_MASK_LEAD_BYTE((c), count); 342 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 343 switch(count) { 344 /* each branch falls through to the next one */ 345 case 5: 346 case 4: 347 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 348 illegal=1; 349 break; 350 case 3: 351 trail=*s++; 352 c=(c<<6)|(trail&0x3f); 353 if(c<0x110) { 354 illegal|=(trail&0xc0)^0x80; 355 } else { 356 /* code point>0x10ffff, outside Unicode */ 357 illegal=1; 358 break; 359 } 360 case 2: 361 trail=*s++; 362 c=(c<<6)|(trail&0x3f); 363 illegal|=(trail&0xc0)^0x80; 364 case 1: 365 trail=*s++; 366 c=(c<<6)|(trail&0x3f); 367 illegal|=(trail&0xc0)^0x80; 368 break; 369 case 0: 370 return U_SENTINEL; 371 /* no default branch to optimize switch() - all values are covered */ 372 } 373 } else { 374 illegal=1; /* too few bytes left */ 375 } 376 377 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 378 /* illegal is also set if count>=4 */ 379 if(illegal || c<utf8_minLegal[count] || UTF_IS_SURROGATE(c)) { 380 /* error handling */ 381 /* don't go beyond this sequence */ 382 s=*ps; 383 while(count>0 && s<limit && UTF8_IS_TRAIL(*s)) { 384 ++s; 385 --count; 386 } 387 c=U_SENTINEL; 388 } 389 *ps=s; 390 return c; 391 } 392 393 U_CAPI UChar* U_EXPORT2 394 u_strFromUTF8WithSub(UChar *dest, 395 int32_t destCapacity, 396 int32_t *pDestLength, 397 const char* src, 398 int32_t srcLength, 399 UChar32 subchar, int32_t *pNumSubstitutions, 400 UErrorCode *pErrorCode){ 401 UChar *pDest = dest; 402 UChar *pDestLimit = dest+destCapacity; 403 UChar32 ch; 404 int32_t reqLength = 0; 405 const uint8_t* pSrc = (const uint8_t*) src; 406 uint8_t t1, t2; /* trail bytes */ 407 int32_t numSubstitutions; 408 409 /* args check */ 410 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 411 return NULL; 412 } 413 414 if( (src==NULL && srcLength!=0) || srcLength < -1 || 415 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 416 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 417 ) { 418 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 419 return NULL; 420 } 421 422 if(pNumSubstitutions!=NULL) { 423 *pNumSubstitutions=0; 424 } 425 numSubstitutions=0; 426 427 /* 428 * Inline processing of UTF-8 byte sequences: 429 * 430 * Byte sequences for the most common characters are handled inline in 431 * the conversion loops. In order to reduce the path lengths for those 432 * characters, the tests are arranged in a kind of binary search. 433 * ASCII (<=0x7f) is checked first, followed by the dividing point 434 * between 2- and 3-byte sequences (0xe0). 435 * The 3-byte branch is tested first to speed up CJK text. 436 * The compiler should combine the subtractions for the two tests for 0xe0. 437 * Each branch then tests for the other end of its range. 438 */ 439 440 if(srcLength < 0){ 441 /* 442 * Transform a NUL-terminated string. 443 * The code explicitly checks for NULs only in the lead byte position. 444 * A NUL byte in the trail byte position fails the trail byte range check anyway. 445 */ 446 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 447 if(ch <= 0x7f){ 448 *pDest++=(UChar)ch; 449 ++pSrc; 450 } else { 451 if(ch > 0xe0) { 452 if( /* handle U+1000..U+CFFF inline */ 453 ch <= 0xec && 454 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 455 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 456 ) { 457 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 458 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 459 pSrc += 3; 460 continue; 461 } 462 } else if(ch < 0xe0) { 463 if( /* handle U+0080..U+07FF inline */ 464 ch >= 0xc2 && 465 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 466 ) { 467 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 468 pSrc += 2; 469 continue; 470 } 471 } 472 473 /* function call for "complicated" and error cases */ 474 ++pSrc; /* continue after the lead byte */ 475 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 476 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 477 *pErrorCode = U_INVALID_CHAR_FOUND; 478 return NULL; 479 } else if(ch<=0xFFFF) { 480 *(pDest++)=(UChar)ch; 481 } else { 482 *(pDest++)=UTF16_LEAD(ch); 483 if(pDest<pDestLimit) { 484 *(pDest++)=UTF16_TRAIL(ch); 485 } else { 486 reqLength++; 487 break; 488 } 489 } 490 } 491 } 492 493 /* Pre-flight the rest of the string. */ 494 while((ch = *pSrc) != 0) { 495 if(ch <= 0x7f){ 496 ++reqLength; 497 ++pSrc; 498 } else { 499 if(ch > 0xe0) { 500 if( /* handle U+1000..U+CFFF inline */ 501 ch <= 0xec && 502 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 503 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 504 ) { 505 ++reqLength; 506 pSrc += 3; 507 continue; 508 } 509 } else if(ch < 0xe0) { 510 if( /* handle U+0080..U+07FF inline */ 511 ch >= 0xc2 && 512 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 513 ) { 514 ++reqLength; 515 pSrc += 2; 516 continue; 517 } 518 } 519 520 /* function call for "complicated" and error cases */ 521 ++pSrc; /* continue after the lead byte */ 522 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 523 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 524 *pErrorCode = U_INVALID_CHAR_FOUND; 525 return NULL; 526 } 527 reqLength += U16_LENGTH(ch); 528 } 529 } 530 } else /* srcLength >= 0 */ { 531 const uint8_t *pSrcLimit = pSrc + srcLength; 532 int32_t count; 533 534 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 535 for(;;) { 536 /* 537 * Each iteration of the inner loop progresses by at most 3 UTF-8 538 * bytes and one UChar, for most characters. 539 * For supplementary code points (4 & 2), which are rare, 540 * there is an additional adjustment. 541 */ 542 count = (int32_t)(pDestLimit - pDest); 543 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 544 if(count > srcLength) { 545 count = srcLength; /* min(remaining dest, remaining src/3) */ 546 } 547 if(count < 3) { 548 /* 549 * Too much overhead if we get near the end of the string, 550 * continue with the next loop. 551 */ 552 break; 553 } 554 555 do { 556 ch = *pSrc; 557 if(ch <= 0x7f){ 558 *pDest++=(UChar)ch; 559 ++pSrc; 560 } else { 561 if(ch > 0xe0) { 562 if( /* handle U+1000..U+CFFF inline */ 563 ch <= 0xec && 564 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 565 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 566 ) { 567 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 568 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 569 pSrc += 3; 570 continue; 571 } 572 } else if(ch < 0xe0) { 573 if( /* handle U+0080..U+07FF inline */ 574 ch >= 0xc2 && 575 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 576 ) { 577 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 578 pSrc += 2; 579 continue; 580 } 581 } 582 583 if(ch >= 0xf0 || subchar > 0xffff) { 584 /* 585 * We may read up to six bytes and write up to two UChars, 586 * which we didn't account for with computing count, 587 * so we adjust it here. 588 */ 589 if(--count == 0) { 590 break; 591 } 592 } 593 594 /* function call for "complicated" and error cases */ 595 ++pSrc; /* continue after the lead byte */ 596 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 597 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 598 *pErrorCode = U_INVALID_CHAR_FOUND; 599 return NULL; 600 }else if(ch<=0xFFFF){ 601 *(pDest++)=(UChar)ch; 602 }else{ 603 *(pDest++)=UTF16_LEAD(ch); 604 *(pDest++)=UTF16_TRAIL(ch); 605 } 606 } 607 } while(--count > 0); 608 } 609 610 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 611 ch = *pSrc; 612 if(ch <= 0x7f){ 613 *pDest++=(UChar)ch; 614 ++pSrc; 615 } else { 616 if(ch > 0xe0) { 617 if( /* handle U+1000..U+CFFF inline */ 618 ch <= 0xec && 619 ((pSrcLimit - pSrc) >= 3) && 620 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 621 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 622 ) { 623 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 624 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 625 pSrc += 3; 626 continue; 627 } 628 } else if(ch < 0xe0) { 629 if( /* handle U+0080..U+07FF inline */ 630 ch >= 0xc2 && 631 ((pSrcLimit - pSrc) >= 2) && 632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 633 ) { 634 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 635 pSrc += 2; 636 continue; 637 } 638 } 639 640 /* function call for "complicated" and error cases */ 641 ++pSrc; /* continue after the lead byte */ 642 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 643 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 644 *pErrorCode = U_INVALID_CHAR_FOUND; 645 return NULL; 646 }else if(ch<=0xFFFF){ 647 *(pDest++)=(UChar)ch; 648 }else{ 649 *(pDest++)=UTF16_LEAD(ch); 650 if(pDest<pDestLimit){ 651 *(pDest++)=UTF16_TRAIL(ch); 652 }else{ 653 reqLength++; 654 break; 655 } 656 } 657 } 658 } 659 /* do not fill the dest buffer just count the UChars needed */ 660 while(pSrc < pSrcLimit){ 661 ch = *pSrc; 662 if(ch <= 0x7f){ 663 reqLength++; 664 ++pSrc; 665 } else { 666 if(ch > 0xe0) { 667 if( /* handle U+1000..U+CFFF inline */ 668 ch <= 0xec && 669 ((pSrcLimit - pSrc) >= 3) && 670 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 671 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 672 ) { 673 reqLength++; 674 pSrc += 3; 675 continue; 676 } 677 } else if(ch < 0xe0) { 678 if( /* handle U+0080..U+07FF inline */ 679 ch >= 0xc2 && 680 ((pSrcLimit - pSrc) >= 2) && 681 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 682 ) { 683 reqLength++; 684 pSrc += 2; 685 continue; 686 } 687 } 688 689 /* function call for "complicated" and error cases */ 690 ++pSrc; /* continue after the lead byte */ 691 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 692 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 693 *pErrorCode = U_INVALID_CHAR_FOUND; 694 return NULL; 695 } 696 reqLength+=UTF_CHAR_LENGTH(ch); 697 } 698 } 699 } 700 701 reqLength+=(int32_t)(pDest - dest); 702 703 if(pNumSubstitutions!=NULL) { 704 *pNumSubstitutions=numSubstitutions; 705 } 706 707 if(pDestLength){ 708 *pDestLength = reqLength; 709 } 710 711 /* Terminate the buffer */ 712 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 713 714 return dest; 715 } 716 717 U_CAPI UChar* U_EXPORT2 718 u_strFromUTF8(UChar *dest, 719 int32_t destCapacity, 720 int32_t *pDestLength, 721 const char* src, 722 int32_t srcLength, 723 UErrorCode *pErrorCode){ 724 return u_strFromUTF8WithSub( 725 dest, destCapacity, pDestLength, 726 src, srcLength, 727 U_SENTINEL, NULL, 728 pErrorCode); 729 } 730 731 U_CAPI UChar * U_EXPORT2 732 u_strFromUTF8Lenient(UChar *dest, 733 int32_t destCapacity, 734 int32_t *pDestLength, 735 const char *src, 736 int32_t srcLength, 737 UErrorCode *pErrorCode) { 738 UChar *pDest = dest; 739 UChar32 ch; 740 int32_t reqLength = 0; 741 uint8_t* pSrc = (uint8_t*) src; 742 743 /* args check */ 744 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 745 return NULL; 746 } 747 748 if( (src==NULL && srcLength!=0) || srcLength < -1 || 749 (destCapacity<0) || (dest == NULL && destCapacity > 0) 750 ) { 751 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 752 return NULL; 753 } 754 755 if(srcLength < 0) { 756 /* Transform a NUL-terminated string. */ 757 UChar *pDestLimit = dest+destCapacity; 758 uint8_t t1, t2, t3; /* trail bytes */ 759 760 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 761 if(ch < 0xc0) { 762 /* 763 * ASCII, or a trail byte in lead position which is treated like 764 * a single-byte sequence for better character boundary 765 * resynchronization after illegal sequences. 766 */ 767 *pDest++=(UChar)ch; 768 ++pSrc; 769 continue; 770 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 771 if((t1 = pSrc[1]) != 0) { 772 /* 0x3080 = (0xc0 << 6) + 0x80 */ 773 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 774 pSrc += 2; 775 continue; 776 } 777 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 778 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 779 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 780 /* 0x2080 = (0x80 << 6) + 0x80 */ 781 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 782 pSrc += 3; 783 continue; 784 } 785 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 787 pSrc += 4; 788 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 789 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 790 *(pDest++) = U16_LEAD(ch); 791 if(pDest < pDestLimit) { 792 *(pDest++) = U16_TRAIL(ch); 793 } else { 794 reqLength = 1; 795 break; 796 } 797 continue; 798 } 799 } 800 801 /* truncated character at the end */ 802 *pDest++ = 0xfffd; 803 while(*++pSrc != 0) {} 804 break; 805 } 806 807 /* Pre-flight the rest of the string. */ 808 while((ch = *pSrc) != 0) { 809 if(ch < 0xc0) { 810 /* 811 * ASCII, or a trail byte in lead position which is treated like 812 * a single-byte sequence for better character boundary 813 * resynchronization after illegal sequences. 814 */ 815 ++reqLength; 816 ++pSrc; 817 continue; 818 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 819 if(pSrc[1] != 0) { 820 ++reqLength; 821 pSrc += 2; 822 continue; 823 } 824 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 825 if(pSrc[1] != 0 && pSrc[2] != 0) { 826 ++reqLength; 827 pSrc += 3; 828 continue; 829 } 830 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 831 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 832 reqLength += 2; 833 pSrc += 4; 834 continue; 835 } 836 } 837 838 /* truncated character at the end */ 839 ++reqLength; 840 break; 841 } 842 } else /* srcLength >= 0 */ { 843 const uint8_t *pSrcLimit = pSrc + srcLength; 844 845 /* 846 * This function requires that if srcLength is given, then it must be 847 * destCapatity >= srcLength so that we need not check for 848 * destination buffer overflow in the loop. 849 */ 850 if(destCapacity < srcLength) { 851 if(pDestLength != NULL) { 852 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 853 } 854 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 855 return NULL; 856 } 857 858 if((pSrcLimit - pSrc) >= 4) { 859 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 860 861 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 862 do { 863 ch = *pSrc++; 864 if(ch < 0xc0) { 865 /* 866 * ASCII, or a trail byte in lead position which is treated like 867 * a single-byte sequence for better character boundary 868 * resynchronization after illegal sequences. 869 */ 870 *pDest++=(UChar)ch; 871 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 872 /* 0x3080 = (0xc0 << 6) + 0x80 */ 873 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 874 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 875 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 876 /* 0x2080 = (0x80 << 6) + 0x80 */ 877 ch = (ch << 12) + (*pSrc++ << 6); 878 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 879 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 880 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 881 ch = (ch << 18) + (*pSrc++ << 12); 882 ch += *pSrc++ << 6; 883 ch += *pSrc++ - 0x3c82080; 884 *(pDest++) = U16_LEAD(ch); 885 *(pDest++) = U16_TRAIL(ch); 886 } 887 } while(pSrc < pSrcLimit); 888 889 pSrcLimit += 3; /* restore original pSrcLimit */ 890 } 891 892 while(pSrc < pSrcLimit) { 893 ch = *pSrc++; 894 if(ch < 0xc0) { 895 /* 896 * ASCII, or a trail byte in lead position which is treated like 897 * a single-byte sequence for better character boundary 898 * resynchronization after illegal sequences. 899 */ 900 *pDest++=(UChar)ch; 901 continue; 902 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 903 if(pSrc < pSrcLimit) { 904 /* 0x3080 = (0xc0 << 6) + 0x80 */ 905 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 906 continue; 907 } 908 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 909 if((pSrcLimit - pSrc) >= 2) { 910 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 911 /* 0x2080 = (0x80 << 6) + 0x80 */ 912 ch = (ch << 12) + (*pSrc++ << 6); 913 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 914 pSrc += 3; 915 continue; 916 } 917 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 918 if((pSrcLimit - pSrc) >= 3) { 919 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 920 ch = (ch << 18) + (*pSrc++ << 12); 921 ch += *pSrc++ << 6; 922 ch += *pSrc++ - 0x3c82080; 923 *(pDest++) = U16_LEAD(ch); 924 *(pDest++) = U16_TRAIL(ch); 925 pSrc += 4; 926 continue; 927 } 928 } 929 930 /* truncated character at the end */ 931 *pDest++ = 0xfffd; 932 break; 933 } 934 } 935 936 reqLength+=(int32_t)(pDest - dest); 937 938 if(pDestLength){ 939 *pDestLength = reqLength; 940 } 941 942 /* Terminate the buffer */ 943 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 944 945 return dest; 946 } 947 948 static U_INLINE uint8_t * 949 _appendUTF8(uint8_t *pDest, UChar32 c) { 950 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 951 if((c)<=0x7f) { 952 *pDest++=(uint8_t)c; 953 } else if(c<=0x7ff) { 954 *pDest++=(uint8_t)((c>>6)|0xc0); 955 *pDest++=(uint8_t)((c&0x3f)|0x80); 956 } else if(c<=0xffff) { 957 *pDest++=(uint8_t)((c>>12)|0xe0); 958 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 959 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 960 } else /* if((uint32_t)(c)<=0x10ffff) */ { 961 *pDest++=(uint8_t)(((c)>>18)|0xf0); 962 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 963 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 964 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 965 } 966 return pDest; 967 } 968 969 970 U_CAPI char* U_EXPORT2 971 u_strToUTF8WithSub(char *dest, 972 int32_t destCapacity, 973 int32_t *pDestLength, 974 const UChar *pSrc, 975 int32_t srcLength, 976 UChar32 subchar, int32_t *pNumSubstitutions, 977 UErrorCode *pErrorCode){ 978 int32_t reqLength=0; 979 uint32_t ch=0,ch2=0; 980 uint8_t *pDest = (uint8_t *)dest; 981 uint8_t *pDestLimit = pDest + destCapacity; 982 int32_t numSubstitutions; 983 984 /* args check */ 985 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 986 return NULL; 987 } 988 989 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 990 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 991 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 992 ) { 993 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 994 return NULL; 995 } 996 997 if(pNumSubstitutions!=NULL) { 998 *pNumSubstitutions=0; 999 } 1000 numSubstitutions=0; 1001 1002 if(srcLength==-1) { 1003 while((ch=*pSrc)!=0) { 1004 ++pSrc; 1005 if(ch <= 0x7f) { 1006 if(pDest<pDestLimit) { 1007 *pDest++ = (uint8_t)ch; 1008 } else { 1009 reqLength = 1; 1010 break; 1011 } 1012 } else if(ch <= 0x7ff) { 1013 if((pDestLimit - pDest) >= 2) { 1014 *pDest++=(uint8_t)((ch>>6)|0xc0); 1015 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1016 } else { 1017 reqLength = 2; 1018 break; 1019 } 1020 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1021 if((pDestLimit - pDest) >= 3) { 1022 *pDest++=(uint8_t)((ch>>12)|0xe0); 1023 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1024 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1025 } else { 1026 reqLength = 3; 1027 break; 1028 } 1029 } else /* ch is a surrogate */ { 1030 int32_t length; 1031 1032 /*need not check for NUL because NUL fails UTF_IS_TRAIL() anyway*/ 1033 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1034 ++pSrc; 1035 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1036 } else if(subchar>=0) { 1037 ch=subchar; 1038 ++numSubstitutions; 1039 } else { 1040 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1041 *pErrorCode = U_INVALID_CHAR_FOUND; 1042 return NULL; 1043 } 1044 1045 length = U8_LENGTH(ch); 1046 if((pDestLimit - pDest) >= length) { 1047 /* convert and append*/ 1048 pDest=_appendUTF8(pDest, ch); 1049 } else { 1050 reqLength = length; 1051 break; 1052 } 1053 } 1054 } 1055 while((ch=*pSrc++)!=0) { 1056 if(ch<=0x7f) { 1057 ++reqLength; 1058 } else if(ch<=0x7ff) { 1059 reqLength+=2; 1060 } else if(!UTF_IS_SURROGATE(ch)) { 1061 reqLength+=3; 1062 } else if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1063 ++pSrc; 1064 reqLength+=4; 1065 } else if(subchar>=0) { 1066 reqLength+=U8_LENGTH(subchar); 1067 ++numSubstitutions; 1068 } else { 1069 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1070 *pErrorCode = U_INVALID_CHAR_FOUND; 1071 return NULL; 1072 } 1073 } 1074 } else { 1075 const UChar *pSrcLimit = pSrc+srcLength; 1076 int32_t count; 1077 1078 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1079 for(;;) { 1080 /* 1081 * Each iteration of the inner loop progresses by at most 3 UTF-8 1082 * bytes and one UChar, for most characters. 1083 * For supplementary code points (4 & 2), which are rare, 1084 * there is an additional adjustment. 1085 */ 1086 count = (int32_t)((pDestLimit - pDest) / 3); 1087 srcLength = (int32_t)(pSrcLimit - pSrc); 1088 if(count > srcLength) { 1089 count = srcLength; /* min(remaining dest/3, remaining src) */ 1090 } 1091 if(count < 3) { 1092 /* 1093 * Too much overhead if we get near the end of the string, 1094 * continue with the next loop. 1095 */ 1096 break; 1097 } 1098 do { 1099 ch=*pSrc++; 1100 if(ch <= 0x7f) { 1101 *pDest++ = (uint8_t)ch; 1102 } else if(ch <= 0x7ff) { 1103 *pDest++=(uint8_t)((ch>>6)|0xc0); 1104 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1105 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1106 *pDest++=(uint8_t)((ch>>12)|0xe0); 1107 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1108 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1109 } else /* ch is a surrogate */ { 1110 /* 1111 * We will read two UChars and probably output four bytes, 1112 * which we didn't account for with computing count, 1113 * so we adjust it here. 1114 */ 1115 if(--count == 0) { 1116 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1117 break; /* recompute count */ 1118 } 1119 1120 if(UTF_IS_SURROGATE_FIRST(ch) && UTF_IS_TRAIL(ch2=*pSrc)) { 1121 ++pSrc; 1122 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1123 1124 /* writing 4 bytes per 2 UChars is ok */ 1125 *pDest++=(uint8_t)((ch>>18)|0xf0); 1126 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1127 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1128 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1129 } else { 1130 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1131 if(subchar>=0) { 1132 ch=subchar; 1133 ++numSubstitutions; 1134 } else { 1135 *pErrorCode = U_INVALID_CHAR_FOUND; 1136 return NULL; 1137 } 1138 1139 /* convert and append*/ 1140 pDest=_appendUTF8(pDest, ch); 1141 } 1142 } 1143 } while(--count > 0); 1144 } 1145 1146 while(pSrc<pSrcLimit) { 1147 ch=*pSrc++; 1148 if(ch <= 0x7f) { 1149 if(pDest<pDestLimit) { 1150 *pDest++ = (uint8_t)ch; 1151 } else { 1152 reqLength = 1; 1153 break; 1154 } 1155 } else if(ch <= 0x7ff) { 1156 if((pDestLimit - pDest) >= 2) { 1157 *pDest++=(uint8_t)((ch>>6)|0xc0); 1158 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1159 } else { 1160 reqLength = 2; 1161 break; 1162 } 1163 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1164 if((pDestLimit - pDest) >= 3) { 1165 *pDest++=(uint8_t)((ch>>12)|0xe0); 1166 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1167 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1168 } else { 1169 reqLength = 3; 1170 break; 1171 } 1172 } else /* ch is a surrogate */ { 1173 int32_t length; 1174 1175 if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { 1176 ++pSrc; 1177 ch=UTF16_GET_PAIR_VALUE(ch, ch2); 1178 } else if(subchar>=0) { 1179 ch=subchar; 1180 ++numSubstitutions; 1181 } else { 1182 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1183 *pErrorCode = U_INVALID_CHAR_FOUND; 1184 return NULL; 1185 } 1186 1187 length = U8_LENGTH(ch); 1188 if((pDestLimit - pDest) >= length) { 1189 /* convert and append*/ 1190 pDest=_appendUTF8(pDest, ch); 1191 } else { 1192 reqLength = length; 1193 break; 1194 } 1195 } 1196 } 1197 while(pSrc<pSrcLimit) { 1198 ch=*pSrc++; 1199 if(ch<=0x7f) { 1200 ++reqLength; 1201 } else if(ch<=0x7ff) { 1202 reqLength+=2; 1203 } else if(!UTF_IS_SURROGATE(ch)) { 1204 reqLength+=3; 1205 } else if(UTF_IS_SURROGATE_FIRST(ch) && pSrc<pSrcLimit && UTF_IS_TRAIL(ch2=*pSrc)) { 1206 ++pSrc; 1207 reqLength+=4; 1208 } else if(subchar>=0) { 1209 reqLength+=U8_LENGTH(subchar); 1210 ++numSubstitutions; 1211 } else { 1212 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1213 *pErrorCode = U_INVALID_CHAR_FOUND; 1214 return NULL; 1215 } 1216 } 1217 } 1218 1219 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1220 1221 if(pNumSubstitutions!=NULL) { 1222 *pNumSubstitutions=numSubstitutions; 1223 } 1224 1225 if(pDestLength){ 1226 *pDestLength = reqLength; 1227 } 1228 1229 /* Terminate the buffer */ 1230 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1231 return dest; 1232 } 1233 1234 U_CAPI char* U_EXPORT2 1235 u_strToUTF8(char *dest, 1236 int32_t destCapacity, 1237 int32_t *pDestLength, 1238 const UChar *pSrc, 1239 int32_t srcLength, 1240 UErrorCode *pErrorCode){ 1241 return u_strToUTF8WithSub( 1242 dest, destCapacity, pDestLength, 1243 pSrc, srcLength, 1244 U_SENTINEL, NULL, 1245 pErrorCode); 1246 } 1247 1248 U_CAPI UChar* U_EXPORT2 1249 u_strFromJavaModifiedUTF8WithSub( 1250 UChar *dest, 1251 int32_t destCapacity, 1252 int32_t *pDestLength, 1253 const char *src, 1254 int32_t srcLength, 1255 UChar32 subchar, int32_t *pNumSubstitutions, 1256 UErrorCode *pErrorCode) { 1257 UChar *pDest = dest; 1258 UChar *pDestLimit = dest+destCapacity; 1259 UChar32 ch; 1260 int32_t reqLength = 0; 1261 const uint8_t* pSrc = (const uint8_t*) src; 1262 const uint8_t *pSrcLimit; 1263 int32_t count; 1264 uint8_t t1, t2; /* trail bytes */ 1265 int32_t numSubstitutions; 1266 1267 /* args check */ 1268 if(U_FAILURE(*pErrorCode)){ 1269 return NULL; 1270 } 1271 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1272 (dest==NULL && destCapacity!=0) || destCapacity<0 || 1273 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1274 ) { 1275 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1276 return NULL; 1277 } 1278 1279 if(pNumSubstitutions!=NULL) { 1280 *pNumSubstitutions=0; 1281 } 1282 numSubstitutions=0; 1283 1284 if(srcLength < 0) { 1285 /* 1286 * Transform a NUL-terminated ASCII string. 1287 * Handle non-ASCII strings with slower code. 1288 */ 1289 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { 1290 *pDest++=(UChar)ch; 1291 ++pSrc; 1292 } 1293 if(ch == 0) { 1294 reqLength=(int32_t)(pDest - dest); 1295 if(pDestLength) { 1296 *pDestLength = reqLength; 1297 } 1298 1299 /* Terminate the buffer */ 1300 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1301 return dest; 1302 } 1303 srcLength = uprv_strlen((const char *)pSrc); 1304 } 1305 1306 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1307 pSrcLimit = pSrc + srcLength; 1308 for(;;) { 1309 count = (int32_t)(pDestLimit - pDest); 1310 srcLength = (int32_t)(pSrcLimit - pSrc); 1311 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { 1312 /* fast ASCII loop */ 1313 const uint8_t *prevSrc = pSrc; 1314 int32_t delta; 1315 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { 1316 *pDest++=(UChar)ch; 1317 ++pSrc; 1318 } 1319 delta = (int32_t)(pSrc - prevSrc); 1320 count -= delta; 1321 srcLength -= delta; 1322 } 1323 /* 1324 * Each iteration of the inner loop progresses by at most 3 UTF-8 1325 * bytes and one UChar. 1326 */ 1327 srcLength /= 3; 1328 if(count > srcLength) { 1329 count = srcLength; /* min(remaining dest, remaining src/3) */ 1330 } 1331 if(count < 3) { 1332 /* 1333 * Too much overhead if we get near the end of the string, 1334 * continue with the next loop. 1335 */ 1336 break; 1337 } 1338 do { 1339 ch = *pSrc; 1340 if(ch <= 0x7f){ 1341 *pDest++=(UChar)ch; 1342 ++pSrc; 1343 } else { 1344 if(ch >= 0xe0) { 1345 if( /* handle U+0000..U+FFFF inline */ 1346 ch <= 0xef && 1347 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1348 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1349 ) { 1350 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1351 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1352 pSrc += 3; 1353 continue; 1354 } 1355 } else { 1356 if( /* handle U+0000..U+07FF inline */ 1357 ch >= 0xc0 && 1358 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1359 ) { 1360 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1361 pSrc += 2; 1362 continue; 1363 } 1364 } 1365 1366 if(subchar < 0) { 1367 *pErrorCode = U_INVALID_CHAR_FOUND; 1368 return NULL; 1369 } else if(subchar > 0xffff && --count == 0) { 1370 /* 1371 * We need to write two UChars, adjusted count for that, 1372 * and ran out of space. 1373 */ 1374 break; 1375 } else { 1376 /* function call for error cases */ 1377 ++pSrc; /* continue after the lead byte */ 1378 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1379 ++numSubstitutions; 1380 if(subchar<=0xFFFF) { 1381 *(pDest++)=(UChar)subchar; 1382 } else { 1383 *(pDest++)=U16_LEAD(subchar); 1384 *(pDest++)=U16_TRAIL(subchar); 1385 } 1386 } 1387 } 1388 } while(--count > 0); 1389 } 1390 1391 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1392 ch = *pSrc; 1393 if(ch <= 0x7f){ 1394 *pDest++=(UChar)ch; 1395 ++pSrc; 1396 } else { 1397 if(ch >= 0xe0) { 1398 if( /* handle U+0000..U+FFFF inline */ 1399 ch <= 0xef && 1400 ((pSrcLimit - pSrc) >= 3) && 1401 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1402 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1403 ) { 1404 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1405 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1406 pSrc += 3; 1407 continue; 1408 } 1409 } else { 1410 if( /* handle U+0000..U+07FF inline */ 1411 ch >= 0xc0 && 1412 ((pSrcLimit - pSrc) >= 2) && 1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1414 ) { 1415 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1416 pSrc += 2; 1417 continue; 1418 } 1419 } 1420 1421 if(subchar < 0) { 1422 *pErrorCode = U_INVALID_CHAR_FOUND; 1423 return NULL; 1424 } else { 1425 /* function call for error cases */ 1426 ++pSrc; /* continue after the lead byte */ 1427 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1428 ++numSubstitutions; 1429 if(subchar<=0xFFFF) { 1430 *(pDest++)=(UChar)subchar; 1431 } else { 1432 *(pDest++)=U16_LEAD(subchar); 1433 if(pDest<pDestLimit) { 1434 *(pDest++)=U16_TRAIL(subchar); 1435 } else { 1436 reqLength++; 1437 break; 1438 } 1439 } 1440 } 1441 } 1442 } 1443 1444 /* do not fill the dest buffer just count the UChars needed */ 1445 while(pSrc < pSrcLimit){ 1446 ch = *pSrc; 1447 if(ch <= 0x7f) { 1448 reqLength++; 1449 ++pSrc; 1450 } else { 1451 if(ch >= 0xe0) { 1452 if( /* handle U+0000..U+FFFF inline */ 1453 ch <= 0xef && 1454 ((pSrcLimit - pSrc) >= 3) && 1455 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1456 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1457 ) { 1458 reqLength++; 1459 pSrc += 3; 1460 continue; 1461 } 1462 } else { 1463 if( /* handle U+0000..U+07FF inline */ 1464 ch >= 0xc0 && 1465 ((pSrcLimit - pSrc) >= 2) && 1466 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1467 ) { 1468 reqLength++; 1469 pSrc += 2; 1470 continue; 1471 } 1472 } 1473 1474 if(subchar < 0) { 1475 *pErrorCode = U_INVALID_CHAR_FOUND; 1476 return NULL; 1477 } else { 1478 /* function call for error cases */ 1479 ++pSrc; /* continue after the lead byte */ 1480 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1481 ++numSubstitutions; 1482 reqLength+=U16_LENGTH(ch); 1483 } 1484 } 1485 } 1486 1487 if(pNumSubstitutions!=NULL) { 1488 *pNumSubstitutions=numSubstitutions; 1489 } 1490 1491 reqLength+=(int32_t)(pDest - dest); 1492 if(pDestLength) { 1493 *pDestLength = reqLength; 1494 } 1495 1496 /* Terminate the buffer */ 1497 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1498 return dest; 1499 } 1500 1501 U_CAPI char* U_EXPORT2 1502 u_strToJavaModifiedUTF8( 1503 char *dest, 1504 int32_t destCapacity, 1505 int32_t *pDestLength, 1506 const UChar *src, 1507 int32_t srcLength, 1508 UErrorCode *pErrorCode) { 1509 int32_t reqLength=0; 1510 uint32_t ch=0; 1511 uint8_t *pDest = (uint8_t *)dest; 1512 uint8_t *pDestLimit = pDest + destCapacity; 1513 const UChar *pSrcLimit; 1514 int32_t count; 1515 1516 /* args check */ 1517 if(U_FAILURE(*pErrorCode)){ 1518 return NULL; 1519 } 1520 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1521 (dest==NULL && destCapacity!=0) || destCapacity<0 1522 ) { 1523 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1524 return NULL; 1525 } 1526 1527 if(srcLength==-1) { 1528 /* Convert NUL-terminated ASCII, then find the string length. */ 1529 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1530 *pDest++ = (uint8_t)ch; 1531 ++src; 1532 } 1533 if(ch == 0) { 1534 reqLength=(int32_t)(pDest - (uint8_t *)dest); 1535 if(pDestLength) { 1536 *pDestLength = reqLength; 1537 } 1538 1539 /* Terminate the buffer */ 1540 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1541 return dest; 1542 } 1543 srcLength = u_strlen(src); 1544 } 1545 1546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1547 pSrcLimit = src+srcLength; 1548 for(;;) { 1549 count = (int32_t)(pDestLimit - pDest); 1550 srcLength = (int32_t)(pSrcLimit - src); 1551 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1552 /* fast ASCII loop */ 1553 const UChar *prevSrc = src; 1554 int32_t delta; 1555 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1556 *pDest++=(uint8_t)ch; 1557 ++src; 1558 } 1559 delta = (int32_t)(src - prevSrc); 1560 count -= delta; 1561 srcLength -= delta; 1562 } 1563 /* 1564 * Each iteration of the inner loop progresses by at most 3 UTF-8 1565 * bytes and one UChar. 1566 */ 1567 count /= 3; 1568 if(count > srcLength) { 1569 count = srcLength; /* min(remaining dest/3, remaining src) */ 1570 } 1571 if(count < 3) { 1572 /* 1573 * Too much overhead if we get near the end of the string, 1574 * continue with the next loop. 1575 */ 1576 break; 1577 } 1578 do { 1579 ch=*src++; 1580 if(ch <= 0x7f && ch != 0) { 1581 *pDest++ = (uint8_t)ch; 1582 } else if(ch <= 0x7ff) { 1583 *pDest++=(uint8_t)((ch>>6)|0xc0); 1584 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1585 } else { 1586 *pDest++=(uint8_t)((ch>>12)|0xe0); 1587 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1588 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1589 } 1590 } while(--count > 0); 1591 } 1592 1593 while(src<pSrcLimit) { 1594 ch=*src++; 1595 if(ch <= 0x7f && ch != 0) { 1596 if(pDest<pDestLimit) { 1597 *pDest++ = (uint8_t)ch; 1598 } else { 1599 reqLength = 1; 1600 break; 1601 } 1602 } else if(ch <= 0x7ff) { 1603 if((pDestLimit - pDest) >= 2) { 1604 *pDest++=(uint8_t)((ch>>6)|0xc0); 1605 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1606 } else { 1607 reqLength = 2; 1608 break; 1609 } 1610 } else { 1611 if((pDestLimit - pDest) >= 3) { 1612 *pDest++=(uint8_t)((ch>>12)|0xe0); 1613 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1614 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1615 } else { 1616 reqLength = 3; 1617 break; 1618 } 1619 } 1620 } 1621 while(src<pSrcLimit) { 1622 ch=*src++; 1623 if(ch <= 0x7f && ch != 0) { 1624 ++reqLength; 1625 } else if(ch<=0x7ff) { 1626 reqLength+=2; 1627 } else { 1628 reqLength+=3; 1629 } 1630 } 1631 1632 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1633 if(pDestLength){ 1634 *pDestLength = reqLength; 1635 } 1636 1637 /* Terminate the buffer */ 1638 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1639 return dest; 1640 } 1641