1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2001-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * 11 * File ustrtrns.cpp 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 9/10/2001 Ram Creation. 17 ****************************************************************************** 18 */ 19 20 /******************************************************************************* 21 * 22 * u_strTo* and u_strFrom* APIs 23 * WCS functions moved to ustr_wcs.c for better modularization 24 * 25 ******************************************************************************* 26 */ 27 28 29 #include "unicode/putil.h" 30 #include "unicode/ustring.h" 31 #include "unicode/utf.h" 32 #include "unicode/utf8.h" 33 #include "unicode/utf16.h" 34 #include "cstring.h" 35 #include "cmemory.h" 36 #include "ustr_imp.h" 37 #include "uassert.h" 38 39 U_CAPI UChar* U_EXPORT2 40 u_strFromUTF32WithSub(UChar *dest, 41 int32_t destCapacity, 42 int32_t *pDestLength, 43 const UChar32 *src, 44 int32_t srcLength, 45 UChar32 subchar, int32_t *pNumSubstitutions, 46 UErrorCode *pErrorCode) { 47 const UChar32 *srcLimit; 48 UChar32 ch; 49 UChar *destLimit; 50 UChar *pDest; 51 int32_t reqLength; 52 int32_t numSubstitutions; 53 54 /* args check */ 55 if(U_FAILURE(*pErrorCode)){ 56 return NULL; 57 } 58 if( (src==NULL && srcLength!=0) || srcLength < -1 || 59 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 60 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 61 ) { 62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 63 return NULL; 64 } 65 66 if(pNumSubstitutions != NULL) { 67 *pNumSubstitutions = 0; 68 } 69 70 pDest = dest; 71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 72 reqLength = 0; 73 numSubstitutions = 0; 74 75 if(srcLength < 0) { 76 /* simple loop for conversion of a NUL-terminated BMP string */ 77 while((ch=*src) != 0 && 78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 79 ++src; 80 if(pDest < destLimit) { 81 *pDest++ = (UChar)ch; 82 } else { 83 ++reqLength; 84 } 85 } 86 srcLimit = src; 87 if(ch != 0) { 88 /* "complicated" case, find the end of the remaining string */ 89 while(*++srcLimit != 0) {} 90 } 91 } else { 92 srcLimit = (src!=NULL)?(src + srcLength):NULL; 93 } 94 95 /* convert with length */ 96 while(src < srcLimit) { 97 ch = *src++; 98 do { 99 /* usually "loops" once; twice only for writing subchar */ 100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 101 if(pDest < destLimit) { 102 *pDest++ = (UChar)ch; 103 } else { 104 ++reqLength; 105 } 106 break; 107 } else if(0x10000 <= ch && ch <= 0x10ffff) { 108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) { 109 *pDest++ = U16_LEAD(ch); 110 *pDest++ = U16_TRAIL(ch); 111 } else { 112 reqLength += 2; 113 } 114 break; 115 } else if((ch = subchar) < 0) { 116 /* surrogate code point, or not a Unicode code point at all */ 117 *pErrorCode = U_INVALID_CHAR_FOUND; 118 return NULL; 119 } else { 120 ++numSubstitutions; 121 } 122 } while(TRUE); 123 } 124 125 reqLength += (int32_t)(pDest - dest); 126 if(pDestLength) { 127 *pDestLength = reqLength; 128 } 129 if(pNumSubstitutions != NULL) { 130 *pNumSubstitutions = numSubstitutions; 131 } 132 133 /* Terminate the buffer */ 134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 135 136 return dest; 137 } 138 139 U_CAPI UChar* U_EXPORT2 140 u_strFromUTF32(UChar *dest, 141 int32_t destCapacity, 142 int32_t *pDestLength, 143 const UChar32 *src, 144 int32_t srcLength, 145 UErrorCode *pErrorCode) { 146 return u_strFromUTF32WithSub( 147 dest, destCapacity, pDestLength, 148 src, srcLength, 149 U_SENTINEL, NULL, 150 pErrorCode); 151 } 152 153 U_CAPI UChar32* U_EXPORT2 154 u_strToUTF32WithSub(UChar32 *dest, 155 int32_t destCapacity, 156 int32_t *pDestLength, 157 const UChar *src, 158 int32_t srcLength, 159 UChar32 subchar, int32_t *pNumSubstitutions, 160 UErrorCode *pErrorCode) { 161 const UChar *srcLimit; 162 UChar32 ch; 163 UChar ch2; 164 UChar32 *destLimit; 165 UChar32 *pDest; 166 int32_t reqLength; 167 int32_t numSubstitutions; 168 169 /* args check */ 170 if(U_FAILURE(*pErrorCode)){ 171 return NULL; 172 } 173 if( (src==NULL && srcLength!=0) || srcLength < -1 || 174 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 175 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 176 ) { 177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 178 return NULL; 179 } 180 181 if(pNumSubstitutions != NULL) { 182 *pNumSubstitutions = 0; 183 } 184 185 pDest = dest; 186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 187 reqLength = 0; 188 numSubstitutions = 0; 189 190 if(srcLength < 0) { 191 /* simple loop for conversion of a NUL-terminated BMP string */ 192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 193 ++src; 194 if(pDest < destLimit) { 195 *pDest++ = ch; 196 } else { 197 ++reqLength; 198 } 199 } 200 srcLimit = src; 201 if(ch != 0) { 202 /* "complicated" case, find the end of the remaining string */ 203 while(*++srcLimit != 0) {} 204 } 205 } else { 206 srcLimit = (src!=NULL)?(src + srcLength):NULL; 207 } 208 209 /* convert with length */ 210 while(src < srcLimit) { 211 ch = *src++; 212 if(!U16_IS_SURROGATE(ch)) { 213 /* write or count ch below */ 214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 215 ++src; 216 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 217 } else if((ch = subchar) < 0) { 218 /* unpaired surrogate */ 219 *pErrorCode = U_INVALID_CHAR_FOUND; 220 return NULL; 221 } else { 222 ++numSubstitutions; 223 } 224 if(pDest < destLimit) { 225 *pDest++ = ch; 226 } else { 227 ++reqLength; 228 } 229 } 230 231 reqLength += (int32_t)(pDest - dest); 232 if(pDestLength) { 233 *pDestLength = reqLength; 234 } 235 if(pNumSubstitutions != NULL) { 236 *pNumSubstitutions = numSubstitutions; 237 } 238 239 /* Terminate the buffer */ 240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 241 242 return dest; 243 } 244 245 U_CAPI UChar32* U_EXPORT2 246 u_strToUTF32(UChar32 *dest, 247 int32_t destCapacity, 248 int32_t *pDestLength, 249 const UChar *src, 250 int32_t srcLength, 251 UErrorCode *pErrorCode) { 252 return u_strToUTF32WithSub( 253 dest, destCapacity, pDestLength, 254 src, srcLength, 255 U_SENTINEL, NULL, 256 pErrorCode); 257 } 258 259 /* for utf8_nextCharSafeBodyTerminated() */ 260 static const UChar32 261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 262 263 /* 264 * Version of utf8_nextCharSafeBody() with the following differences: 265 * - checks for NUL termination instead of length 266 * - works with pointers instead of indexes 267 * - always strict (strict==-1) 268 * 269 * *ps points to after the lead byte and will be moved to after the last trail byte. 270 * c is the lead byte. 271 * @return the code point, or U_SENTINEL 272 */ 273 static UChar32 274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 275 const uint8_t *s=*ps; 276 uint8_t trail, illegal=0; 277 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 278 U_ASSERT(count<6); 279 U8_MASK_LEAD_BYTE((c), count); 280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 281 switch(count) { 282 /* each branch falls through to the next one */ 283 case 5: 284 case 4: 285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 286 illegal=1; 287 break; 288 case 3: 289 trail=(uint8_t)(*s++ - 0x80); 290 c=(c<<6)|trail; 291 if(trail>0x3f || c>=0x110) { 292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 293 illegal=1; 294 break; 295 } 296 U_FALLTHROUGH; 297 case 2: 298 trail=(uint8_t)(*s++ - 0x80); 299 if(trail>0x3f) { 300 /* not a trail byte */ 301 illegal=1; 302 break; 303 } 304 c=(c<<6)|trail; 305 U_FALLTHROUGH; 306 case 1: 307 trail=(uint8_t)(*s++ - 0x80); 308 if(trail>0x3f) { 309 /* not a trail byte */ 310 illegal=1; 311 } 312 c=(c<<6)|trail; 313 break; 314 case 0: 315 return U_SENTINEL; 316 /* no default branch to optimize switch() - all values are covered */ 317 } 318 319 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 320 /* illegal is also set if count>=4 */ 321 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 322 /* error handling */ 323 /* don't go beyond this sequence */ 324 s=*ps; 325 while(count>0 && U8_IS_TRAIL(*s)) { 326 ++s; 327 --count; 328 } 329 c=U_SENTINEL; 330 } 331 *ps=s; 332 return c; 333 } 334 335 /* 336 * Version of utf8_nextCharSafeBody() with the following differences: 337 * - works with pointers instead of indexes 338 * - always strict (strict==-1) 339 * 340 * *ps points to after the lead byte and will be moved to after the last trail byte. 341 * c is the lead byte. 342 * @return the code point, or U_SENTINEL 343 */ 344 static UChar32 345 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 346 const uint8_t *s=*ps; 347 uint8_t trail, illegal=0; 348 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 349 if((limit-s)>=count) { 350 U8_MASK_LEAD_BYTE((c), count); 351 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 352 switch(count) { 353 /* each branch falls through to the next one */ 354 case 5: 355 case 4: 356 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 357 illegal=1; 358 break; 359 case 3: 360 trail=*s++; 361 c=(c<<6)|(trail&0x3f); 362 if(c<0x110) { 363 illegal|=(trail&0xc0)^0x80; 364 } else { 365 /* code point>0x10ffff, outside Unicode */ 366 illegal=1; 367 break; 368 } 369 U_FALLTHROUGH; 370 case 2: 371 trail=*s++; 372 c=(c<<6)|(trail&0x3f); 373 illegal|=(trail&0xc0)^0x80; 374 U_FALLTHROUGH; 375 case 1: 376 trail=*s++; 377 c=(c<<6)|(trail&0x3f); 378 illegal|=(trail&0xc0)^0x80; 379 break; 380 case 0: 381 return U_SENTINEL; 382 /* no default branch to optimize switch() - all values are covered */ 383 } 384 } else { 385 illegal=1; /* too few bytes left */ 386 } 387 388 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 389 /* illegal is also set if count>=4 */ 390 U_ASSERT(illegal || count<UPRV_LENGTHOF(utf8_minLegal)); 391 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 392 /* error handling */ 393 /* don't go beyond this sequence */ 394 s=*ps; 395 while(count>0 && s<limit && U8_IS_TRAIL(*s)) { 396 ++s; 397 --count; 398 } 399 c=U_SENTINEL; 400 } 401 *ps=s; 402 return c; 403 } 404 405 U_CAPI UChar* U_EXPORT2 406 u_strFromUTF8WithSub(UChar *dest, 407 int32_t destCapacity, 408 int32_t *pDestLength, 409 const char* src, 410 int32_t srcLength, 411 UChar32 subchar, int32_t *pNumSubstitutions, 412 UErrorCode *pErrorCode){ 413 UChar *pDest = dest; 414 UChar *pDestLimit = dest+destCapacity; 415 UChar32 ch; 416 int32_t reqLength = 0; 417 const uint8_t* pSrc = (const uint8_t*) src; 418 uint8_t t1, t2; /* trail bytes */ 419 int32_t numSubstitutions; 420 421 /* args check */ 422 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 423 return NULL; 424 } 425 426 if( (src==NULL && srcLength!=0) || srcLength < -1 || 427 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 428 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 429 ) { 430 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 431 return NULL; 432 } 433 434 if(pNumSubstitutions!=NULL) { 435 *pNumSubstitutions=0; 436 } 437 numSubstitutions=0; 438 439 /* 440 * Inline processing of UTF-8 byte sequences: 441 * 442 * Byte sequences for the most common characters are handled inline in 443 * the conversion loops. In order to reduce the path lengths for those 444 * characters, the tests are arranged in a kind of binary search. 445 * ASCII (<=0x7f) is checked first, followed by the dividing point 446 * between 2- and 3-byte sequences (0xe0). 447 * The 3-byte branch is tested first to speed up CJK text. 448 * The compiler should combine the subtractions for the two tests for 0xe0. 449 * Each branch then tests for the other end of its range. 450 */ 451 452 if(srcLength < 0){ 453 /* 454 * Transform a NUL-terminated string. 455 * The code explicitly checks for NULs only in the lead byte position. 456 * A NUL byte in the trail byte position fails the trail byte range check anyway. 457 */ 458 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 459 if(ch <= 0x7f){ 460 *pDest++=(UChar)ch; 461 ++pSrc; 462 } else { 463 if(ch > 0xe0) { 464 if( /* handle U+1000..U+CFFF inline */ 465 ch <= 0xec && 466 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 467 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 468 ) { 469 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 470 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 471 pSrc += 3; 472 continue; 473 } 474 } else if(ch < 0xe0) { 475 if( /* handle U+0080..U+07FF inline */ 476 ch >= 0xc2 && 477 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 478 ) { 479 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 480 pSrc += 2; 481 continue; 482 } 483 } 484 485 /* function call for "complicated" and error cases */ 486 ++pSrc; /* continue after the lead byte */ 487 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 488 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 489 *pErrorCode = U_INVALID_CHAR_FOUND; 490 return NULL; 491 } else if(ch<=0xFFFF) { 492 *(pDest++)=(UChar)ch; 493 } else { 494 *(pDest++)=U16_LEAD(ch); 495 if(pDest<pDestLimit) { 496 *(pDest++)=U16_TRAIL(ch); 497 } else { 498 reqLength++; 499 break; 500 } 501 } 502 } 503 } 504 505 /* Pre-flight the rest of the string. */ 506 while((ch = *pSrc) != 0) { 507 if(ch <= 0x7f){ 508 ++reqLength; 509 ++pSrc; 510 } else { 511 if(ch > 0xe0) { 512 if( /* handle U+1000..U+CFFF inline */ 513 ch <= 0xec && 514 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 515 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 516 ) { 517 ++reqLength; 518 pSrc += 3; 519 continue; 520 } 521 } else if(ch < 0xe0) { 522 if( /* handle U+0080..U+07FF inline */ 523 ch >= 0xc2 && 524 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 525 ) { 526 ++reqLength; 527 pSrc += 2; 528 continue; 529 } 530 } 531 532 /* function call for "complicated" and error cases */ 533 ++pSrc; /* continue after the lead byte */ 534 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 535 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 536 *pErrorCode = U_INVALID_CHAR_FOUND; 537 return NULL; 538 } 539 reqLength += U16_LENGTH(ch); 540 } 541 } 542 } else /* srcLength >= 0 */ { 543 const uint8_t *pSrcLimit = pSrc + srcLength; 544 int32_t count; 545 546 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 547 for(;;) { 548 /* 549 * Each iteration of the inner loop progresses by at most 3 UTF-8 550 * bytes and one UChar, for most characters. 551 * For supplementary code points (4 & 2), which are rare, 552 * there is an additional adjustment. 553 */ 554 count = (int32_t)(pDestLimit - pDest); 555 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 556 if(count > srcLength) { 557 count = srcLength; /* min(remaining dest, remaining src/3) */ 558 } 559 if(count < 3) { 560 /* 561 * Too much overhead if we get near the end of the string, 562 * continue with the next loop. 563 */ 564 break; 565 } 566 567 do { 568 ch = *pSrc; 569 if(ch <= 0x7f){ 570 *pDest++=(UChar)ch; 571 ++pSrc; 572 } else { 573 if(ch > 0xe0) { 574 if( /* handle U+1000..U+CFFF inline */ 575 ch <= 0xec && 576 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 577 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 578 ) { 579 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 580 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 581 pSrc += 3; 582 continue; 583 } 584 } else if(ch < 0xe0) { 585 if( /* handle U+0080..U+07FF inline */ 586 ch >= 0xc2 && 587 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 588 ) { 589 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 590 pSrc += 2; 591 continue; 592 } 593 } 594 595 if(ch >= 0xf0 || subchar > 0xffff) { 596 /* 597 * We may read up to six bytes and write up to two UChars, 598 * which we didn't account for with computing count, 599 * so we adjust it here. 600 */ 601 if(--count == 0) { 602 break; 603 } 604 } 605 606 /* function call for "complicated" and error cases */ 607 ++pSrc; /* continue after the lead byte */ 608 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 609 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 610 *pErrorCode = U_INVALID_CHAR_FOUND; 611 return NULL; 612 }else if(ch<=0xFFFF){ 613 *(pDest++)=(UChar)ch; 614 }else{ 615 *(pDest++)=U16_LEAD(ch); 616 *(pDest++)=U16_TRAIL(ch); 617 } 618 } 619 } while(--count > 0); 620 } 621 622 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 623 ch = *pSrc; 624 if(ch <= 0x7f){ 625 *pDest++=(UChar)ch; 626 ++pSrc; 627 } else { 628 if(ch > 0xe0) { 629 if( /* handle U+1000..U+CFFF inline */ 630 ch <= 0xec && 631 ((pSrcLimit - pSrc) >= 3) && 632 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 633 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 634 ) { 635 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 636 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 637 pSrc += 3; 638 continue; 639 } 640 } else if(ch < 0xe0) { 641 if( /* handle U+0080..U+07FF inline */ 642 ch >= 0xc2 && 643 ((pSrcLimit - pSrc) >= 2) && 644 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 645 ) { 646 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 647 pSrc += 2; 648 continue; 649 } 650 } 651 652 /* function call for "complicated" and error cases */ 653 ++pSrc; /* continue after the lead byte */ 654 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 655 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 656 *pErrorCode = U_INVALID_CHAR_FOUND; 657 return NULL; 658 }else if(ch<=0xFFFF){ 659 *(pDest++)=(UChar)ch; 660 }else{ 661 *(pDest++)=U16_LEAD(ch); 662 if(pDest<pDestLimit){ 663 *(pDest++)=U16_TRAIL(ch); 664 }else{ 665 reqLength++; 666 break; 667 } 668 } 669 } 670 } 671 /* do not fill the dest buffer just count the UChars needed */ 672 while(pSrc < pSrcLimit){ 673 ch = *pSrc; 674 if(ch <= 0x7f){ 675 reqLength++; 676 ++pSrc; 677 } else { 678 if(ch > 0xe0) { 679 if( /* handle U+1000..U+CFFF inline */ 680 ch <= 0xec && 681 ((pSrcLimit - pSrc) >= 3) && 682 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 683 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 684 ) { 685 reqLength++; 686 pSrc += 3; 687 continue; 688 } 689 } else if(ch < 0xe0) { 690 if( /* handle U+0080..U+07FF inline */ 691 ch >= 0xc2 && 692 ((pSrcLimit - pSrc) >= 2) && 693 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 694 ) { 695 reqLength++; 696 pSrc += 2; 697 continue; 698 } 699 } 700 701 /* function call for "complicated" and error cases */ 702 ++pSrc; /* continue after the lead byte */ 703 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 704 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 705 *pErrorCode = U_INVALID_CHAR_FOUND; 706 return NULL; 707 } 708 reqLength+=U16_LENGTH(ch); 709 } 710 } 711 } 712 713 reqLength+=(int32_t)(pDest - dest); 714 715 if(pNumSubstitutions!=NULL) { 716 *pNumSubstitutions=numSubstitutions; 717 } 718 719 if(pDestLength){ 720 *pDestLength = reqLength; 721 } 722 723 /* Terminate the buffer */ 724 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 725 726 return dest; 727 } 728 729 U_CAPI UChar* U_EXPORT2 730 u_strFromUTF8(UChar *dest, 731 int32_t destCapacity, 732 int32_t *pDestLength, 733 const char* src, 734 int32_t srcLength, 735 UErrorCode *pErrorCode){ 736 return u_strFromUTF8WithSub( 737 dest, destCapacity, pDestLength, 738 src, srcLength, 739 U_SENTINEL, NULL, 740 pErrorCode); 741 } 742 743 U_CAPI UChar * U_EXPORT2 744 u_strFromUTF8Lenient(UChar *dest, 745 int32_t destCapacity, 746 int32_t *pDestLength, 747 const char *src, 748 int32_t srcLength, 749 UErrorCode *pErrorCode) { 750 UChar *pDest = dest; 751 UChar32 ch; 752 int32_t reqLength = 0; 753 uint8_t* pSrc = (uint8_t*) src; 754 755 /* args check */ 756 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 757 return NULL; 758 } 759 760 if( (src==NULL && srcLength!=0) || srcLength < -1 || 761 (destCapacity<0) || (dest == NULL && destCapacity > 0) 762 ) { 763 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 764 return NULL; 765 } 766 767 if(srcLength < 0) { 768 /* Transform a NUL-terminated string. */ 769 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 770 uint8_t t1, t2, t3; /* trail bytes */ 771 772 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 773 if(ch < 0xc0) { 774 /* 775 * ASCII, or a trail byte in lead position which is treated like 776 * a single-byte sequence for better character boundary 777 * resynchronization after illegal sequences. 778 */ 779 *pDest++=(UChar)ch; 780 ++pSrc; 781 continue; 782 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 783 if((t1 = pSrc[1]) != 0) { 784 /* 0x3080 = (0xc0 << 6) + 0x80 */ 785 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 786 pSrc += 2; 787 continue; 788 } 789 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 790 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 791 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 792 /* 0x2080 = (0x80 << 6) + 0x80 */ 793 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 794 pSrc += 3; 795 continue; 796 } 797 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 798 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 799 pSrc += 4; 800 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 801 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 802 *(pDest++) = U16_LEAD(ch); 803 if(pDest < pDestLimit) { 804 *(pDest++) = U16_TRAIL(ch); 805 } else { 806 reqLength = 1; 807 break; 808 } 809 continue; 810 } 811 } 812 813 /* truncated character at the end */ 814 *pDest++ = 0xfffd; 815 while(*++pSrc != 0) {} 816 break; 817 } 818 819 /* Pre-flight the rest of the string. */ 820 while((ch = *pSrc) != 0) { 821 if(ch < 0xc0) { 822 /* 823 * ASCII, or a trail byte in lead position which is treated like 824 * a single-byte sequence for better character boundary 825 * resynchronization after illegal sequences. 826 */ 827 ++reqLength; 828 ++pSrc; 829 continue; 830 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 831 if(pSrc[1] != 0) { 832 ++reqLength; 833 pSrc += 2; 834 continue; 835 } 836 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 837 if(pSrc[1] != 0 && pSrc[2] != 0) { 838 ++reqLength; 839 pSrc += 3; 840 continue; 841 } 842 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 843 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 844 reqLength += 2; 845 pSrc += 4; 846 continue; 847 } 848 } 849 850 /* truncated character at the end */ 851 ++reqLength; 852 break; 853 } 854 } else /* srcLength >= 0 */ { 855 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; 856 857 /* 858 * This function requires that if srcLength is given, then it must be 859 * destCapatity >= srcLength so that we need not check for 860 * destination buffer overflow in the loop. 861 */ 862 if(destCapacity < srcLength) { 863 if(pDestLength != NULL) { 864 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 865 } 866 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 867 return NULL; 868 } 869 870 if((pSrcLimit - pSrc) >= 4) { 871 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 872 873 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 874 do { 875 ch = *pSrc++; 876 if(ch < 0xc0) { 877 /* 878 * ASCII, or a trail byte in lead position which is treated like 879 * a single-byte sequence for better character boundary 880 * resynchronization after illegal sequences. 881 */ 882 *pDest++=(UChar)ch; 883 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 884 /* 0x3080 = (0xc0 << 6) + 0x80 */ 885 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 886 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 887 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 888 /* 0x2080 = (0x80 << 6) + 0x80 */ 889 ch = (ch << 12) + (*pSrc++ << 6); 890 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 891 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 892 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 893 ch = (ch << 18) + (*pSrc++ << 12); 894 ch += *pSrc++ << 6; 895 ch += *pSrc++ - 0x3c82080; 896 *(pDest++) = U16_LEAD(ch); 897 *(pDest++) = U16_TRAIL(ch); 898 } 899 } while(pSrc < pSrcLimit); 900 901 pSrcLimit += 3; /* restore original pSrcLimit */ 902 } 903 904 while(pSrc < pSrcLimit) { 905 ch = *pSrc++; 906 if(ch < 0xc0) { 907 /* 908 * ASCII, or a trail byte in lead position which is treated like 909 * a single-byte sequence for better character boundary 910 * resynchronization after illegal sequences. 911 */ 912 *pDest++=(UChar)ch; 913 continue; 914 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 915 if(pSrc < pSrcLimit) { 916 /* 0x3080 = (0xc0 << 6) + 0x80 */ 917 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 918 continue; 919 } 920 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 921 if((pSrcLimit - pSrc) >= 2) { 922 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 923 /* 0x2080 = (0x80 << 6) + 0x80 */ 924 ch = (ch << 12) + (*pSrc++ << 6); 925 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 926 pSrc += 3; 927 continue; 928 } 929 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 930 if((pSrcLimit - pSrc) >= 3) { 931 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 932 ch = (ch << 18) + (*pSrc++ << 12); 933 ch += *pSrc++ << 6; 934 ch += *pSrc++ - 0x3c82080; 935 *(pDest++) = U16_LEAD(ch); 936 *(pDest++) = U16_TRAIL(ch); 937 pSrc += 4; 938 continue; 939 } 940 } 941 942 /* truncated character at the end */ 943 *pDest++ = 0xfffd; 944 break; 945 } 946 } 947 948 reqLength+=(int32_t)(pDest - dest); 949 950 if(pDestLength){ 951 *pDestLength = reqLength; 952 } 953 954 /* Terminate the buffer */ 955 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 956 957 return dest; 958 } 959 960 static inline uint8_t * 961 _appendUTF8(uint8_t *pDest, UChar32 c) { 962 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 963 if((c)<=0x7f) { 964 *pDest++=(uint8_t)c; 965 } else if(c<=0x7ff) { 966 *pDest++=(uint8_t)((c>>6)|0xc0); 967 *pDest++=(uint8_t)((c&0x3f)|0x80); 968 } else if(c<=0xffff) { 969 *pDest++=(uint8_t)((c>>12)|0xe0); 970 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 971 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 972 } else /* if((uint32_t)(c)<=0x10ffff) */ { 973 *pDest++=(uint8_t)(((c)>>18)|0xf0); 974 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 975 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 976 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 977 } 978 return pDest; 979 } 980 981 982 U_CAPI char* U_EXPORT2 983 u_strToUTF8WithSub(char *dest, 984 int32_t destCapacity, 985 int32_t *pDestLength, 986 const UChar *pSrc, 987 int32_t srcLength, 988 UChar32 subchar, int32_t *pNumSubstitutions, 989 UErrorCode *pErrorCode){ 990 int32_t reqLength=0; 991 uint32_t ch=0,ch2=0; 992 uint8_t *pDest = (uint8_t *)dest; 993 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; 994 int32_t numSubstitutions; 995 996 /* args check */ 997 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 998 return NULL; 999 } 1000 1001 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 1002 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 1003 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1004 ) { 1005 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1006 return NULL; 1007 } 1008 1009 if(pNumSubstitutions!=NULL) { 1010 *pNumSubstitutions=0; 1011 } 1012 numSubstitutions=0; 1013 1014 if(srcLength==-1) { 1015 while((ch=*pSrc)!=0) { 1016 ++pSrc; 1017 if(ch <= 0x7f) { 1018 if(pDest<pDestLimit) { 1019 *pDest++ = (uint8_t)ch; 1020 } else { 1021 reqLength = 1; 1022 break; 1023 } 1024 } else if(ch <= 0x7ff) { 1025 if((pDestLimit - pDest) >= 2) { 1026 *pDest++=(uint8_t)((ch>>6)|0xc0); 1027 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1028 } else { 1029 reqLength = 2; 1030 break; 1031 } 1032 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1033 if((pDestLimit - pDest) >= 3) { 1034 *pDest++=(uint8_t)((ch>>12)|0xe0); 1035 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1036 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1037 } else { 1038 reqLength = 3; 1039 break; 1040 } 1041 } else /* ch is a surrogate */ { 1042 int32_t length; 1043 1044 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ 1045 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1046 ++pSrc; 1047 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1048 } else if(subchar>=0) { 1049 ch=subchar; 1050 ++numSubstitutions; 1051 } else { 1052 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1053 *pErrorCode = U_INVALID_CHAR_FOUND; 1054 return NULL; 1055 } 1056 1057 length = U8_LENGTH(ch); 1058 if((pDestLimit - pDest) >= length) { 1059 /* convert and append*/ 1060 pDest=_appendUTF8(pDest, ch); 1061 } else { 1062 reqLength = length; 1063 break; 1064 } 1065 } 1066 } 1067 while((ch=*pSrc++)!=0) { 1068 if(ch<=0x7f) { 1069 ++reqLength; 1070 } else if(ch<=0x7ff) { 1071 reqLength+=2; 1072 } else if(!U16_IS_SURROGATE(ch)) { 1073 reqLength+=3; 1074 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1075 ++pSrc; 1076 reqLength+=4; 1077 } else if(subchar>=0) { 1078 reqLength+=U8_LENGTH(subchar); 1079 ++numSubstitutions; 1080 } else { 1081 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1082 *pErrorCode = U_INVALID_CHAR_FOUND; 1083 return NULL; 1084 } 1085 } 1086 } else { 1087 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; 1088 int32_t count; 1089 1090 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1091 for(;;) { 1092 /* 1093 * Each iteration of the inner loop progresses by at most 3 UTF-8 1094 * bytes and one UChar, for most characters. 1095 * For supplementary code points (4 & 2), which are rare, 1096 * there is an additional adjustment. 1097 */ 1098 count = (int32_t)((pDestLimit - pDest) / 3); 1099 srcLength = (int32_t)(pSrcLimit - pSrc); 1100 if(count > srcLength) { 1101 count = srcLength; /* min(remaining dest/3, remaining src) */ 1102 } 1103 if(count < 3) { 1104 /* 1105 * Too much overhead if we get near the end of the string, 1106 * continue with the next loop. 1107 */ 1108 break; 1109 } 1110 do { 1111 ch=*pSrc++; 1112 if(ch <= 0x7f) { 1113 *pDest++ = (uint8_t)ch; 1114 } else if(ch <= 0x7ff) { 1115 *pDest++=(uint8_t)((ch>>6)|0xc0); 1116 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1117 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1118 *pDest++=(uint8_t)((ch>>12)|0xe0); 1119 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1120 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1121 } else /* ch is a surrogate */ { 1122 /* 1123 * We will read two UChars and probably output four bytes, 1124 * which we didn't account for with computing count, 1125 * so we adjust it here. 1126 */ 1127 if(--count == 0) { 1128 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1129 break; /* recompute count */ 1130 } 1131 1132 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1133 ++pSrc; 1134 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1135 1136 /* writing 4 bytes per 2 UChars is ok */ 1137 *pDest++=(uint8_t)((ch>>18)|0xf0); 1138 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1139 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1140 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1141 } else { 1142 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1143 if(subchar>=0) { 1144 ch=subchar; 1145 ++numSubstitutions; 1146 } else { 1147 *pErrorCode = U_INVALID_CHAR_FOUND; 1148 return NULL; 1149 } 1150 1151 /* convert and append*/ 1152 pDest=_appendUTF8(pDest, ch); 1153 } 1154 } 1155 } while(--count > 0); 1156 } 1157 1158 while(pSrc<pSrcLimit) { 1159 ch=*pSrc++; 1160 if(ch <= 0x7f) { 1161 if(pDest<pDestLimit) { 1162 *pDest++ = (uint8_t)ch; 1163 } else { 1164 reqLength = 1; 1165 break; 1166 } 1167 } else if(ch <= 0x7ff) { 1168 if((pDestLimit - pDest) >= 2) { 1169 *pDest++=(uint8_t)((ch>>6)|0xc0); 1170 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1171 } else { 1172 reqLength = 2; 1173 break; 1174 } 1175 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1176 if((pDestLimit - pDest) >= 3) { 1177 *pDest++=(uint8_t)((ch>>12)|0xe0); 1178 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1179 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1180 } else { 1181 reqLength = 3; 1182 break; 1183 } 1184 } else /* ch is a surrogate */ { 1185 int32_t length; 1186 1187 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1188 ++pSrc; 1189 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1190 } else if(subchar>=0) { 1191 ch=subchar; 1192 ++numSubstitutions; 1193 } else { 1194 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1195 *pErrorCode = U_INVALID_CHAR_FOUND; 1196 return NULL; 1197 } 1198 1199 length = U8_LENGTH(ch); 1200 if((pDestLimit - pDest) >= length) { 1201 /* convert and append*/ 1202 pDest=_appendUTF8(pDest, ch); 1203 } else { 1204 reqLength = length; 1205 break; 1206 } 1207 } 1208 } 1209 while(pSrc<pSrcLimit) { 1210 ch=*pSrc++; 1211 if(ch<=0x7f) { 1212 ++reqLength; 1213 } else if(ch<=0x7ff) { 1214 reqLength+=2; 1215 } else if(!U16_IS_SURROGATE(ch)) { 1216 reqLength+=3; 1217 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1218 ++pSrc; 1219 reqLength+=4; 1220 } else if(subchar>=0) { 1221 reqLength+=U8_LENGTH(subchar); 1222 ++numSubstitutions; 1223 } else { 1224 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1225 *pErrorCode = U_INVALID_CHAR_FOUND; 1226 return NULL; 1227 } 1228 } 1229 } 1230 1231 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1232 1233 if(pNumSubstitutions!=NULL) { 1234 *pNumSubstitutions=numSubstitutions; 1235 } 1236 1237 if(pDestLength){ 1238 *pDestLength = reqLength; 1239 } 1240 1241 /* Terminate the buffer */ 1242 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1243 return dest; 1244 } 1245 1246 U_CAPI char* U_EXPORT2 1247 u_strToUTF8(char *dest, 1248 int32_t destCapacity, 1249 int32_t *pDestLength, 1250 const UChar *pSrc, 1251 int32_t srcLength, 1252 UErrorCode *pErrorCode){ 1253 return u_strToUTF8WithSub( 1254 dest, destCapacity, pDestLength, 1255 pSrc, srcLength, 1256 U_SENTINEL, NULL, 1257 pErrorCode); 1258 } 1259 1260 U_CAPI UChar* U_EXPORT2 1261 u_strFromJavaModifiedUTF8WithSub( 1262 UChar *dest, 1263 int32_t destCapacity, 1264 int32_t *pDestLength, 1265 const char *src, 1266 int32_t srcLength, 1267 UChar32 subchar, int32_t *pNumSubstitutions, 1268 UErrorCode *pErrorCode) { 1269 UChar *pDest = dest; 1270 UChar *pDestLimit = dest+destCapacity; 1271 UChar32 ch; 1272 int32_t reqLength = 0; 1273 const uint8_t* pSrc = (const uint8_t*) src; 1274 const uint8_t *pSrcLimit; 1275 int32_t count; 1276 uint8_t t1, t2; /* trail bytes */ 1277 int32_t numSubstitutions; 1278 1279 /* args check */ 1280 if(U_FAILURE(*pErrorCode)){ 1281 return NULL; 1282 } 1283 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1284 (dest==NULL && destCapacity!=0) || destCapacity<0 || 1285 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1286 ) { 1287 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1288 return NULL; 1289 } 1290 1291 if(pNumSubstitutions!=NULL) { 1292 *pNumSubstitutions=0; 1293 } 1294 numSubstitutions=0; 1295 1296 if(srcLength < 0) { 1297 /* 1298 * Transform a NUL-terminated ASCII string. 1299 * Handle non-ASCII strings with slower code. 1300 */ 1301 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { 1302 *pDest++=(UChar)ch; 1303 ++pSrc; 1304 } 1305 if(ch == 0) { 1306 reqLength=(int32_t)(pDest - dest); 1307 if(pDestLength) { 1308 *pDestLength = reqLength; 1309 } 1310 1311 /* Terminate the buffer */ 1312 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1313 return dest; 1314 } 1315 srcLength = uprv_strlen((const char *)pSrc); 1316 } 1317 1318 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1319 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; 1320 for(;;) { 1321 count = (int32_t)(pDestLimit - pDest); 1322 srcLength = (int32_t)(pSrcLimit - pSrc); 1323 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { 1324 /* fast ASCII loop */ 1325 const uint8_t *prevSrc = pSrc; 1326 int32_t delta; 1327 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { 1328 *pDest++=(UChar)ch; 1329 ++pSrc; 1330 } 1331 delta = (int32_t)(pSrc - prevSrc); 1332 count -= delta; 1333 srcLength -= delta; 1334 } 1335 /* 1336 * Each iteration of the inner loop progresses by at most 3 UTF-8 1337 * bytes and one UChar. 1338 */ 1339 srcLength /= 3; 1340 if(count > srcLength) { 1341 count = srcLength; /* min(remaining dest, remaining src/3) */ 1342 } 1343 if(count < 3) { 1344 /* 1345 * Too much overhead if we get near the end of the string, 1346 * continue with the next loop. 1347 */ 1348 break; 1349 } 1350 do { 1351 ch = *pSrc; 1352 if(ch <= 0x7f){ 1353 *pDest++=(UChar)ch; 1354 ++pSrc; 1355 } else { 1356 if(ch >= 0xe0) { 1357 if( /* handle U+0000..U+FFFF inline */ 1358 ch <= 0xef && 1359 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1360 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1361 ) { 1362 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1363 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1364 pSrc += 3; 1365 continue; 1366 } 1367 } else { 1368 if( /* handle U+0000..U+07FF inline */ 1369 ch >= 0xc0 && 1370 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1371 ) { 1372 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1373 pSrc += 2; 1374 continue; 1375 } 1376 } 1377 1378 if(subchar < 0) { 1379 *pErrorCode = U_INVALID_CHAR_FOUND; 1380 return NULL; 1381 } else if(subchar > 0xffff && --count == 0) { 1382 /* 1383 * We need to write two UChars, adjusted count for that, 1384 * and ran out of space. 1385 */ 1386 break; 1387 } else { 1388 /* function call for error cases */ 1389 ++pSrc; /* continue after the lead byte */ 1390 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1391 ++numSubstitutions; 1392 if(subchar<=0xFFFF) { 1393 *(pDest++)=(UChar)subchar; 1394 } else { 1395 *(pDest++)=U16_LEAD(subchar); 1396 *(pDest++)=U16_TRAIL(subchar); 1397 } 1398 } 1399 } 1400 } while(--count > 0); 1401 } 1402 1403 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1404 ch = *pSrc; 1405 if(ch <= 0x7f){ 1406 *pDest++=(UChar)ch; 1407 ++pSrc; 1408 } else { 1409 if(ch >= 0xe0) { 1410 if( /* handle U+0000..U+FFFF inline */ 1411 ch <= 0xef && 1412 ((pSrcLimit - pSrc) >= 3) && 1413 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1414 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1415 ) { 1416 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1417 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1418 pSrc += 3; 1419 continue; 1420 } 1421 } else { 1422 if( /* handle U+0000..U+07FF inline */ 1423 ch >= 0xc0 && 1424 ((pSrcLimit - pSrc) >= 2) && 1425 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1426 ) { 1427 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1428 pSrc += 2; 1429 continue; 1430 } 1431 } 1432 1433 if(subchar < 0) { 1434 *pErrorCode = U_INVALID_CHAR_FOUND; 1435 return NULL; 1436 } else { 1437 /* function call for error cases */ 1438 ++pSrc; /* continue after the lead byte */ 1439 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1440 ++numSubstitutions; 1441 if(subchar<=0xFFFF) { 1442 *(pDest++)=(UChar)subchar; 1443 } else { 1444 *(pDest++)=U16_LEAD(subchar); 1445 if(pDest<pDestLimit) { 1446 *(pDest++)=U16_TRAIL(subchar); 1447 } else { 1448 reqLength++; 1449 break; 1450 } 1451 } 1452 } 1453 } 1454 } 1455 1456 /* do not fill the dest buffer just count the UChars needed */ 1457 while(pSrc < pSrcLimit){ 1458 ch = *pSrc; 1459 if(ch <= 0x7f) { 1460 reqLength++; 1461 ++pSrc; 1462 } else { 1463 if(ch >= 0xe0) { 1464 if( /* handle U+0000..U+FFFF inline */ 1465 ch <= 0xef && 1466 ((pSrcLimit - pSrc) >= 3) && 1467 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1468 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1469 ) { 1470 reqLength++; 1471 pSrc += 3; 1472 continue; 1473 } 1474 } else { 1475 if( /* handle U+0000..U+07FF inline */ 1476 ch >= 0xc0 && 1477 ((pSrcLimit - pSrc) >= 2) && 1478 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1479 ) { 1480 reqLength++; 1481 pSrc += 2; 1482 continue; 1483 } 1484 } 1485 1486 if(subchar < 0) { 1487 *pErrorCode = U_INVALID_CHAR_FOUND; 1488 return NULL; 1489 } else { 1490 /* function call for error cases */ 1491 ++pSrc; /* continue after the lead byte */ 1492 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1493 ++numSubstitutions; 1494 reqLength+=U16_LENGTH(ch); 1495 } 1496 } 1497 } 1498 1499 if(pNumSubstitutions!=NULL) { 1500 *pNumSubstitutions=numSubstitutions; 1501 } 1502 1503 reqLength+=(int32_t)(pDest - dest); 1504 if(pDestLength) { 1505 *pDestLength = reqLength; 1506 } 1507 1508 /* Terminate the buffer */ 1509 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1510 return dest; 1511 } 1512 1513 U_CAPI char* U_EXPORT2 1514 u_strToJavaModifiedUTF8( 1515 char *dest, 1516 int32_t destCapacity, 1517 int32_t *pDestLength, 1518 const UChar *src, 1519 int32_t srcLength, 1520 UErrorCode *pErrorCode) { 1521 int32_t reqLength=0; 1522 uint32_t ch=0; 1523 uint8_t *pDest = (uint8_t *)dest; 1524 uint8_t *pDestLimit = pDest + destCapacity; 1525 const UChar *pSrcLimit; 1526 int32_t count; 1527 1528 /* args check */ 1529 if(U_FAILURE(*pErrorCode)){ 1530 return NULL; 1531 } 1532 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1533 (dest==NULL && destCapacity!=0) || destCapacity<0 1534 ) { 1535 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1536 return NULL; 1537 } 1538 1539 if(srcLength==-1) { 1540 /* Convert NUL-terminated ASCII, then find the string length. */ 1541 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1542 *pDest++ = (uint8_t)ch; 1543 ++src; 1544 } 1545 if(ch == 0) { 1546 reqLength=(int32_t)(pDest - (uint8_t *)dest); 1547 if(pDestLength) { 1548 *pDestLength = reqLength; 1549 } 1550 1551 /* Terminate the buffer */ 1552 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1553 return dest; 1554 } 1555 srcLength = u_strlen(src); 1556 } 1557 1558 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1559 pSrcLimit = (src!=NULL)?(src+srcLength):NULL; 1560 for(;;) { 1561 count = (int32_t)(pDestLimit - pDest); 1562 srcLength = (int32_t)(pSrcLimit - src); 1563 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1564 /* fast ASCII loop */ 1565 const UChar *prevSrc = src; 1566 int32_t delta; 1567 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1568 *pDest++=(uint8_t)ch; 1569 ++src; 1570 } 1571 delta = (int32_t)(src - prevSrc); 1572 count -= delta; 1573 srcLength -= delta; 1574 } 1575 /* 1576 * Each iteration of the inner loop progresses by at most 3 UTF-8 1577 * bytes and one UChar. 1578 */ 1579 count /= 3; 1580 if(count > srcLength) { 1581 count = srcLength; /* min(remaining dest/3, remaining src) */ 1582 } 1583 if(count < 3) { 1584 /* 1585 * Too much overhead if we get near the end of the string, 1586 * continue with the next loop. 1587 */ 1588 break; 1589 } 1590 do { 1591 ch=*src++; 1592 if(ch <= 0x7f && ch != 0) { 1593 *pDest++ = (uint8_t)ch; 1594 } else if(ch <= 0x7ff) { 1595 *pDest++=(uint8_t)((ch>>6)|0xc0); 1596 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1597 } else { 1598 *pDest++=(uint8_t)((ch>>12)|0xe0); 1599 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1600 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1601 } 1602 } while(--count > 0); 1603 } 1604 1605 while(src<pSrcLimit) { 1606 ch=*src++; 1607 if(ch <= 0x7f && ch != 0) { 1608 if(pDest<pDestLimit) { 1609 *pDest++ = (uint8_t)ch; 1610 } else { 1611 reqLength = 1; 1612 break; 1613 } 1614 } else if(ch <= 0x7ff) { 1615 if((pDestLimit - pDest) >= 2) { 1616 *pDest++=(uint8_t)((ch>>6)|0xc0); 1617 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1618 } else { 1619 reqLength = 2; 1620 break; 1621 } 1622 } else { 1623 if((pDestLimit - pDest) >= 3) { 1624 *pDest++=(uint8_t)((ch>>12)|0xe0); 1625 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1626 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1627 } else { 1628 reqLength = 3; 1629 break; 1630 } 1631 } 1632 } 1633 while(src<pSrcLimit) { 1634 ch=*src++; 1635 if(ch <= 0x7f && ch != 0) { 1636 ++reqLength; 1637 } else if(ch<=0x7ff) { 1638 reqLength+=2; 1639 } else { 1640 reqLength+=3; 1641 } 1642 } 1643 1644 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1645 if(pDestLength){ 1646 *pDestLength = reqLength; 1647 } 1648 1649 /* Terminate the buffer */ 1650 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1651 return dest; 1652 } 1653