1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2001-2012, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * 9 * File ustrtrns.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 9/10/2001 Ram Creation. 15 ****************************************************************************** 16 */ 17 18 /******************************************************************************* 19 * 20 * u_strTo* and u_strFrom* APIs 21 * WCS functions moved to ustr_wcs.c for better modularization 22 * 23 ******************************************************************************* 24 */ 25 26 27 #include "unicode/putil.h" 28 #include "unicode/ustring.h" 29 #include "unicode/utf.h" 30 #include "unicode/utf8.h" 31 #include "unicode/utf16.h" 32 #include "cstring.h" 33 #include "cmemory.h" 34 #include "ustr_imp.h" 35 #include "uassert.h" 36 37 U_CAPI UChar* U_EXPORT2 38 u_strFromUTF32WithSub(UChar *dest, 39 int32_t destCapacity, 40 int32_t *pDestLength, 41 const UChar32 *src, 42 int32_t srcLength, 43 UChar32 subchar, int32_t *pNumSubstitutions, 44 UErrorCode *pErrorCode) { 45 const UChar32 *srcLimit; 46 UChar32 ch; 47 UChar *destLimit; 48 UChar *pDest; 49 int32_t reqLength; 50 int32_t numSubstitutions; 51 52 /* args check */ 53 if(U_FAILURE(*pErrorCode)){ 54 return NULL; 55 } 56 if( (src==NULL && srcLength!=0) || srcLength < -1 || 57 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 58 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 59 ) { 60 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 61 return NULL; 62 } 63 64 if(pNumSubstitutions != NULL) { 65 *pNumSubstitutions = 0; 66 } 67 68 pDest = dest; 69 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 70 reqLength = 0; 71 numSubstitutions = 0; 72 73 if(srcLength < 0) { 74 /* simple loop for conversion of a NUL-terminated BMP string */ 75 while((ch=*src) != 0 && 76 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 77 ++src; 78 if(pDest < destLimit) { 79 *pDest++ = (UChar)ch; 80 } else { 81 ++reqLength; 82 } 83 } 84 srcLimit = src; 85 if(ch != 0) { 86 /* "complicated" case, find the end of the remaining string */ 87 while(*++srcLimit != 0) {} 88 } 89 } else { 90 srcLimit = (src!=NULL)?(src + srcLength):NULL; 91 } 92 93 /* convert with length */ 94 while(src < srcLimit) { 95 ch = *src++; 96 do { 97 /* usually "loops" once; twice only for writing subchar */ 98 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 99 if(pDest < destLimit) { 100 *pDest++ = (UChar)ch; 101 } else { 102 ++reqLength; 103 } 104 break; 105 } else if(0x10000 <= ch && ch <= 0x10ffff) { 106 if(pDest!=NULL && ((pDest + 2) <= destLimit)) { 107 *pDest++ = U16_LEAD(ch); 108 *pDest++ = U16_TRAIL(ch); 109 } else { 110 reqLength += 2; 111 } 112 break; 113 } else if((ch = subchar) < 0) { 114 /* surrogate code point, or not a Unicode code point at all */ 115 *pErrorCode = U_INVALID_CHAR_FOUND; 116 return NULL; 117 } else { 118 ++numSubstitutions; 119 } 120 } while(TRUE); 121 } 122 123 reqLength += (int32_t)(pDest - dest); 124 if(pDestLength) { 125 *pDestLength = reqLength; 126 } 127 if(pNumSubstitutions != NULL) { 128 *pNumSubstitutions = numSubstitutions; 129 } 130 131 /* Terminate the buffer */ 132 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 133 134 return dest; 135 } 136 137 U_CAPI UChar* U_EXPORT2 138 u_strFromUTF32(UChar *dest, 139 int32_t destCapacity, 140 int32_t *pDestLength, 141 const UChar32 *src, 142 int32_t srcLength, 143 UErrorCode *pErrorCode) { 144 return u_strFromUTF32WithSub( 145 dest, destCapacity, pDestLength, 146 src, srcLength, 147 U_SENTINEL, NULL, 148 pErrorCode); 149 } 150 151 U_CAPI UChar32* U_EXPORT2 152 u_strToUTF32WithSub(UChar32 *dest, 153 int32_t destCapacity, 154 int32_t *pDestLength, 155 const UChar *src, 156 int32_t srcLength, 157 UChar32 subchar, int32_t *pNumSubstitutions, 158 UErrorCode *pErrorCode) { 159 const UChar *srcLimit; 160 UChar32 ch; 161 UChar ch2; 162 UChar32 *destLimit; 163 UChar32 *pDest; 164 int32_t reqLength; 165 int32_t numSubstitutions; 166 167 /* args check */ 168 if(U_FAILURE(*pErrorCode)){ 169 return NULL; 170 } 171 if( (src==NULL && srcLength!=0) || srcLength < -1 || 172 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 173 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 174 ) { 175 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 176 return NULL; 177 } 178 179 if(pNumSubstitutions != NULL) { 180 *pNumSubstitutions = 0; 181 } 182 183 pDest = dest; 184 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 185 reqLength = 0; 186 numSubstitutions = 0; 187 188 if(srcLength < 0) { 189 /* simple loop for conversion of a NUL-terminated BMP string */ 190 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 191 ++src; 192 if(pDest < destLimit) { 193 *pDest++ = ch; 194 } else { 195 ++reqLength; 196 } 197 } 198 srcLimit = src; 199 if(ch != 0) { 200 /* "complicated" case, find the end of the remaining string */ 201 while(*++srcLimit != 0) {} 202 } 203 } else { 204 srcLimit = (src!=NULL)?(src + srcLength):NULL; 205 } 206 207 /* convert with length */ 208 while(src < srcLimit) { 209 ch = *src++; 210 if(!U16_IS_SURROGATE(ch)) { 211 /* write or count ch below */ 212 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 213 ++src; 214 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 215 } else if((ch = subchar) < 0) { 216 /* unpaired surrogate */ 217 *pErrorCode = U_INVALID_CHAR_FOUND; 218 return NULL; 219 } else { 220 ++numSubstitutions; 221 } 222 if(pDest < destLimit) { 223 *pDest++ = ch; 224 } else { 225 ++reqLength; 226 } 227 } 228 229 reqLength += (int32_t)(pDest - dest); 230 if(pDestLength) { 231 *pDestLength = reqLength; 232 } 233 if(pNumSubstitutions != NULL) { 234 *pNumSubstitutions = numSubstitutions; 235 } 236 237 /* Terminate the buffer */ 238 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 239 240 return dest; 241 } 242 243 U_CAPI UChar32* U_EXPORT2 244 u_strToUTF32(UChar32 *dest, 245 int32_t destCapacity, 246 int32_t *pDestLength, 247 const UChar *src, 248 int32_t srcLength, 249 UErrorCode *pErrorCode) { 250 return u_strToUTF32WithSub( 251 dest, destCapacity, pDestLength, 252 src, srcLength, 253 U_SENTINEL, NULL, 254 pErrorCode); 255 } 256 257 /* for utf8_nextCharSafeBodyTerminated() */ 258 static const UChar32 259 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 260 261 /* 262 * Version of utf8_nextCharSafeBody() with the following differences: 263 * - checks for NUL termination instead of length 264 * - works with pointers instead of indexes 265 * - always strict (strict==-1) 266 * 267 * *ps points to after the lead byte and will be moved to after the last trail byte. 268 * c is the lead byte. 269 * @return the code point, or U_SENTINEL 270 */ 271 static UChar32 272 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 273 const uint8_t *s=*ps; 274 uint8_t trail, illegal=0; 275 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 276 U_ASSERT(count<6); 277 U8_MASK_LEAD_BYTE((c), count); 278 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 279 switch(count) { 280 /* each branch falls through to the next one */ 281 case 5: 282 case 4: 283 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 284 illegal=1; 285 break; 286 case 3: 287 trail=(uint8_t)(*s++ - 0x80); 288 c=(c<<6)|trail; 289 if(trail>0x3f || c>=0x110) { 290 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 291 illegal=1; 292 break; 293 } 294 case 2: /*fall through*/ 295 trail=(uint8_t)(*s++ - 0x80); 296 if(trail>0x3f) { 297 /* not a trail byte */ 298 illegal=1; 299 break; 300 } 301 c=(c<<6)|trail; 302 case 1: /*fall through*/ 303 trail=(uint8_t)(*s++ - 0x80); 304 if(trail>0x3f) { 305 /* not a trail byte */ 306 illegal=1; 307 } 308 c=(c<<6)|trail; 309 break; 310 case 0: 311 return U_SENTINEL; 312 /* no default branch to optimize switch() - all values are covered */ 313 } 314 315 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 316 /* illegal is also set if count>=4 */ 317 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 318 /* error handling */ 319 /* don't go beyond this sequence */ 320 s=*ps; 321 while(count>0 && U8_IS_TRAIL(*s)) { 322 ++s; 323 --count; 324 } 325 c=U_SENTINEL; 326 } 327 *ps=s; 328 return c; 329 } 330 331 /* 332 * Version of utf8_nextCharSafeBody() with the following differences: 333 * - works with pointers instead of indexes 334 * - always strict (strict==-1) 335 * 336 * *ps points to after the lead byte and will be moved to after the last trail byte. 337 * c is the lead byte. 338 * @return the code point, or U_SENTINEL 339 */ 340 static UChar32 341 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 342 const uint8_t *s=*ps; 343 uint8_t trail, illegal=0; 344 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 345 if((limit-s)>=count) { 346 U8_MASK_LEAD_BYTE((c), count); 347 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 348 switch(count) { 349 /* each branch falls through to the next one */ 350 case 5: 351 case 4: 352 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 353 illegal=1; 354 break; 355 case 3: 356 trail=*s++; 357 c=(c<<6)|(trail&0x3f); 358 if(c<0x110) { 359 illegal|=(trail&0xc0)^0x80; 360 } else { 361 /* code point>0x10ffff, outside Unicode */ 362 illegal=1; 363 break; 364 } 365 case 2: /*fall through*/ 366 trail=*s++; 367 c=(c<<6)|(trail&0x3f); 368 illegal|=(trail&0xc0)^0x80; 369 case 1: /*fall through*/ 370 trail=*s++; 371 c=(c<<6)|(trail&0x3f); 372 illegal|=(trail&0xc0)^0x80; 373 break; 374 case 0: 375 return U_SENTINEL; 376 /* no default branch to optimize switch() - all values are covered */ 377 } 378 } else { 379 illegal=1; /* too few bytes left */ 380 } 381 382 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 383 /* illegal is also set if count>=4 */ 384 U_ASSERT(count<sizeof(utf8_minLegal)/sizeof(utf8_minLegal[0])); 385 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 386 /* error handling */ 387 /* don't go beyond this sequence */ 388 s=*ps; 389 while(count>0 && s<limit && U8_IS_TRAIL(*s)) { 390 ++s; 391 --count; 392 } 393 c=U_SENTINEL; 394 } 395 *ps=s; 396 return c; 397 } 398 399 U_CAPI UChar* U_EXPORT2 400 u_strFromUTF8WithSub(UChar *dest, 401 int32_t destCapacity, 402 int32_t *pDestLength, 403 const char* src, 404 int32_t srcLength, 405 UChar32 subchar, int32_t *pNumSubstitutions, 406 UErrorCode *pErrorCode){ 407 UChar *pDest = dest; 408 UChar *pDestLimit = dest+destCapacity; 409 UChar32 ch; 410 int32_t reqLength = 0; 411 const uint8_t* pSrc = (const uint8_t*) src; 412 uint8_t t1, t2; /* trail bytes */ 413 int32_t numSubstitutions; 414 415 /* args check */ 416 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 417 return NULL; 418 } 419 420 if( (src==NULL && srcLength!=0) || srcLength < -1 || 421 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 422 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 423 ) { 424 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 425 return NULL; 426 } 427 428 if(pNumSubstitutions!=NULL) { 429 *pNumSubstitutions=0; 430 } 431 numSubstitutions=0; 432 433 /* 434 * Inline processing of UTF-8 byte sequences: 435 * 436 * Byte sequences for the most common characters are handled inline in 437 * the conversion loops. In order to reduce the path lengths for those 438 * characters, the tests are arranged in a kind of binary search. 439 * ASCII (<=0x7f) is checked first, followed by the dividing point 440 * between 2- and 3-byte sequences (0xe0). 441 * The 3-byte branch is tested first to speed up CJK text. 442 * The compiler should combine the subtractions for the two tests for 0xe0. 443 * Each branch then tests for the other end of its range. 444 */ 445 446 if(srcLength < 0){ 447 /* 448 * Transform a NUL-terminated string. 449 * The code explicitly checks for NULs only in the lead byte position. 450 * A NUL byte in the trail byte position fails the trail byte range check anyway. 451 */ 452 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 453 if(ch <= 0x7f){ 454 *pDest++=(UChar)ch; 455 ++pSrc; 456 } else { 457 if(ch > 0xe0) { 458 if( /* handle U+1000..U+CFFF inline */ 459 ch <= 0xec && 460 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 461 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 462 ) { 463 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 464 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 465 pSrc += 3; 466 continue; 467 } 468 } else if(ch < 0xe0) { 469 if( /* handle U+0080..U+07FF inline */ 470 ch >= 0xc2 && 471 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 472 ) { 473 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 474 pSrc += 2; 475 continue; 476 } 477 } 478 479 /* function call for "complicated" and error cases */ 480 ++pSrc; /* continue after the lead byte */ 481 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 482 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 483 *pErrorCode = U_INVALID_CHAR_FOUND; 484 return NULL; 485 } else if(ch<=0xFFFF) { 486 *(pDest++)=(UChar)ch; 487 } else { 488 *(pDest++)=U16_LEAD(ch); 489 if(pDest<pDestLimit) { 490 *(pDest++)=U16_TRAIL(ch); 491 } else { 492 reqLength++; 493 break; 494 } 495 } 496 } 497 } 498 499 /* Pre-flight the rest of the string. */ 500 while((ch = *pSrc) != 0) { 501 if(ch <= 0x7f){ 502 ++reqLength; 503 ++pSrc; 504 } else { 505 if(ch > 0xe0) { 506 if( /* handle U+1000..U+CFFF inline */ 507 ch <= 0xec && 508 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 509 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 510 ) { 511 ++reqLength; 512 pSrc += 3; 513 continue; 514 } 515 } else if(ch < 0xe0) { 516 if( /* handle U+0080..U+07FF inline */ 517 ch >= 0xc2 && 518 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 519 ) { 520 ++reqLength; 521 pSrc += 2; 522 continue; 523 } 524 } 525 526 /* function call for "complicated" and error cases */ 527 ++pSrc; /* continue after the lead byte */ 528 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 529 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 530 *pErrorCode = U_INVALID_CHAR_FOUND; 531 return NULL; 532 } 533 reqLength += U16_LENGTH(ch); 534 } 535 } 536 } else /* srcLength >= 0 */ { 537 const uint8_t *pSrcLimit = pSrc + srcLength; 538 int32_t count; 539 540 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 541 for(;;) { 542 /* 543 * Each iteration of the inner loop progresses by at most 3 UTF-8 544 * bytes and one UChar, for most characters. 545 * For supplementary code points (4 & 2), which are rare, 546 * there is an additional adjustment. 547 */ 548 count = (int32_t)(pDestLimit - pDest); 549 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 550 if(count > srcLength) { 551 count = srcLength; /* min(remaining dest, remaining src/3) */ 552 } 553 if(count < 3) { 554 /* 555 * Too much overhead if we get near the end of the string, 556 * continue with the next loop. 557 */ 558 break; 559 } 560 561 do { 562 ch = *pSrc; 563 if(ch <= 0x7f){ 564 *pDest++=(UChar)ch; 565 ++pSrc; 566 } else { 567 if(ch > 0xe0) { 568 if( /* handle U+1000..U+CFFF inline */ 569 ch <= 0xec && 570 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 571 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 572 ) { 573 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 574 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 575 pSrc += 3; 576 continue; 577 } 578 } else if(ch < 0xe0) { 579 if( /* handle U+0080..U+07FF inline */ 580 ch >= 0xc2 && 581 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 582 ) { 583 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 584 pSrc += 2; 585 continue; 586 } 587 } 588 589 if(ch >= 0xf0 || subchar > 0xffff) { 590 /* 591 * We may read up to six bytes and write up to two UChars, 592 * which we didn't account for with computing count, 593 * so we adjust it here. 594 */ 595 if(--count == 0) { 596 break; 597 } 598 } 599 600 /* function call for "complicated" and error cases */ 601 ++pSrc; /* continue after the lead byte */ 602 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 603 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 604 *pErrorCode = U_INVALID_CHAR_FOUND; 605 return NULL; 606 }else if(ch<=0xFFFF){ 607 *(pDest++)=(UChar)ch; 608 }else{ 609 *(pDest++)=U16_LEAD(ch); 610 *(pDest++)=U16_TRAIL(ch); 611 } 612 } 613 } while(--count > 0); 614 } 615 616 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 617 ch = *pSrc; 618 if(ch <= 0x7f){ 619 *pDest++=(UChar)ch; 620 ++pSrc; 621 } else { 622 if(ch > 0xe0) { 623 if( /* handle U+1000..U+CFFF inline */ 624 ch <= 0xec && 625 ((pSrcLimit - pSrc) >= 3) && 626 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 627 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 628 ) { 629 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 630 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 631 pSrc += 3; 632 continue; 633 } 634 } else if(ch < 0xe0) { 635 if( /* handle U+0080..U+07FF inline */ 636 ch >= 0xc2 && 637 ((pSrcLimit - pSrc) >= 2) && 638 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 639 ) { 640 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 641 pSrc += 2; 642 continue; 643 } 644 } 645 646 /* function call for "complicated" and error cases */ 647 ++pSrc; /* continue after the lead byte */ 648 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 649 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 650 *pErrorCode = U_INVALID_CHAR_FOUND; 651 return NULL; 652 }else if(ch<=0xFFFF){ 653 *(pDest++)=(UChar)ch; 654 }else{ 655 *(pDest++)=U16_LEAD(ch); 656 if(pDest<pDestLimit){ 657 *(pDest++)=U16_TRAIL(ch); 658 }else{ 659 reqLength++; 660 break; 661 } 662 } 663 } 664 } 665 /* do not fill the dest buffer just count the UChars needed */ 666 while(pSrc < pSrcLimit){ 667 ch = *pSrc; 668 if(ch <= 0x7f){ 669 reqLength++; 670 ++pSrc; 671 } else { 672 if(ch > 0xe0) { 673 if( /* handle U+1000..U+CFFF inline */ 674 ch <= 0xec && 675 ((pSrcLimit - pSrc) >= 3) && 676 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 677 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 678 ) { 679 reqLength++; 680 pSrc += 3; 681 continue; 682 } 683 } else if(ch < 0xe0) { 684 if( /* handle U+0080..U+07FF inline */ 685 ch >= 0xc2 && 686 ((pSrcLimit - pSrc) >= 2) && 687 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 688 ) { 689 reqLength++; 690 pSrc += 2; 691 continue; 692 } 693 } 694 695 /* function call for "complicated" and error cases */ 696 ++pSrc; /* continue after the lead byte */ 697 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 698 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 699 *pErrorCode = U_INVALID_CHAR_FOUND; 700 return NULL; 701 } 702 reqLength+=U16_LENGTH(ch); 703 } 704 } 705 } 706 707 reqLength+=(int32_t)(pDest - dest); 708 709 if(pNumSubstitutions!=NULL) { 710 *pNumSubstitutions=numSubstitutions; 711 } 712 713 if(pDestLength){ 714 *pDestLength = reqLength; 715 } 716 717 /* Terminate the buffer */ 718 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 719 720 return dest; 721 } 722 723 U_CAPI UChar* U_EXPORT2 724 u_strFromUTF8(UChar *dest, 725 int32_t destCapacity, 726 int32_t *pDestLength, 727 const char* src, 728 int32_t srcLength, 729 UErrorCode *pErrorCode){ 730 return u_strFromUTF8WithSub( 731 dest, destCapacity, pDestLength, 732 src, srcLength, 733 U_SENTINEL, NULL, 734 pErrorCode); 735 } 736 737 U_CAPI UChar * U_EXPORT2 738 u_strFromUTF8Lenient(UChar *dest, 739 int32_t destCapacity, 740 int32_t *pDestLength, 741 const char *src, 742 int32_t srcLength, 743 UErrorCode *pErrorCode) { 744 UChar *pDest = dest; 745 UChar32 ch; 746 int32_t reqLength = 0; 747 uint8_t* pSrc = (uint8_t*) src; 748 749 /* args check */ 750 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 751 return NULL; 752 } 753 754 if( (src==NULL && srcLength!=0) || srcLength < -1 || 755 (destCapacity<0) || (dest == NULL && destCapacity > 0) 756 ) { 757 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 758 return NULL; 759 } 760 761 if(srcLength < 0) { 762 /* Transform a NUL-terminated string. */ 763 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 764 uint8_t t1, t2, t3; /* trail bytes */ 765 766 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 767 if(ch < 0xc0) { 768 /* 769 * ASCII, or a trail byte in lead position which is treated like 770 * a single-byte sequence for better character boundary 771 * resynchronization after illegal sequences. 772 */ 773 *pDest++=(UChar)ch; 774 ++pSrc; 775 continue; 776 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 777 if((t1 = pSrc[1]) != 0) { 778 /* 0x3080 = (0xc0 << 6) + 0x80 */ 779 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 780 pSrc += 2; 781 continue; 782 } 783 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 784 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 785 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 786 /* 0x2080 = (0x80 << 6) + 0x80 */ 787 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 788 pSrc += 3; 789 continue; 790 } 791 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 792 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 793 pSrc += 4; 794 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 795 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 796 *(pDest++) = U16_LEAD(ch); 797 if(pDest < pDestLimit) { 798 *(pDest++) = U16_TRAIL(ch); 799 } else { 800 reqLength = 1; 801 break; 802 } 803 continue; 804 } 805 } 806 807 /* truncated character at the end */ 808 *pDest++ = 0xfffd; 809 while(*++pSrc != 0) {} 810 break; 811 } 812 813 /* Pre-flight the rest of the string. */ 814 while((ch = *pSrc) != 0) { 815 if(ch < 0xc0) { 816 /* 817 * ASCII, or a trail byte in lead position which is treated like 818 * a single-byte sequence for better character boundary 819 * resynchronization after illegal sequences. 820 */ 821 ++reqLength; 822 ++pSrc; 823 continue; 824 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 825 if(pSrc[1] != 0) { 826 ++reqLength; 827 pSrc += 2; 828 continue; 829 } 830 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 831 if(pSrc[1] != 0 && pSrc[2] != 0) { 832 ++reqLength; 833 pSrc += 3; 834 continue; 835 } 836 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 837 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 838 reqLength += 2; 839 pSrc += 4; 840 continue; 841 } 842 } 843 844 /* truncated character at the end */ 845 ++reqLength; 846 break; 847 } 848 } else /* srcLength >= 0 */ { 849 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; 850 851 /* 852 * This function requires that if srcLength is given, then it must be 853 * destCapatity >= srcLength so that we need not check for 854 * destination buffer overflow in the loop. 855 */ 856 if(destCapacity < srcLength) { 857 if(pDestLength != NULL) { 858 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 859 } 860 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 861 return NULL; 862 } 863 864 if((pSrcLimit - pSrc) >= 4) { 865 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 866 867 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 868 do { 869 ch = *pSrc++; 870 if(ch < 0xc0) { 871 /* 872 * ASCII, or a trail byte in lead position which is treated like 873 * a single-byte sequence for better character boundary 874 * resynchronization after illegal sequences. 875 */ 876 *pDest++=(UChar)ch; 877 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 878 /* 0x3080 = (0xc0 << 6) + 0x80 */ 879 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 880 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 881 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 882 /* 0x2080 = (0x80 << 6) + 0x80 */ 883 ch = (ch << 12) + (*pSrc++ << 6); 884 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 885 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 886 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 887 ch = (ch << 18) + (*pSrc++ << 12); 888 ch += *pSrc++ << 6; 889 ch += *pSrc++ - 0x3c82080; 890 *(pDest++) = U16_LEAD(ch); 891 *(pDest++) = U16_TRAIL(ch); 892 } 893 } while(pSrc < pSrcLimit); 894 895 pSrcLimit += 3; /* restore original pSrcLimit */ 896 } 897 898 while(pSrc < pSrcLimit) { 899 ch = *pSrc++; 900 if(ch < 0xc0) { 901 /* 902 * ASCII, or a trail byte in lead position which is treated like 903 * a single-byte sequence for better character boundary 904 * resynchronization after illegal sequences. 905 */ 906 *pDest++=(UChar)ch; 907 continue; 908 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 909 if(pSrc < pSrcLimit) { 910 /* 0x3080 = (0xc0 << 6) + 0x80 */ 911 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 912 continue; 913 } 914 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 915 if((pSrcLimit - pSrc) >= 2) { 916 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 917 /* 0x2080 = (0x80 << 6) + 0x80 */ 918 ch = (ch << 12) + (*pSrc++ << 6); 919 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 920 pSrc += 3; 921 continue; 922 } 923 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 924 if((pSrcLimit - pSrc) >= 3) { 925 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 926 ch = (ch << 18) + (*pSrc++ << 12); 927 ch += *pSrc++ << 6; 928 ch += *pSrc++ - 0x3c82080; 929 *(pDest++) = U16_LEAD(ch); 930 *(pDest++) = U16_TRAIL(ch); 931 pSrc += 4; 932 continue; 933 } 934 } 935 936 /* truncated character at the end */ 937 *pDest++ = 0xfffd; 938 break; 939 } 940 } 941 942 reqLength+=(int32_t)(pDest - dest); 943 944 if(pDestLength){ 945 *pDestLength = reqLength; 946 } 947 948 /* Terminate the buffer */ 949 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 950 951 return dest; 952 } 953 954 static inline uint8_t * 955 _appendUTF8(uint8_t *pDest, UChar32 c) { 956 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 957 if((c)<=0x7f) { 958 *pDest++=(uint8_t)c; 959 } else if(c<=0x7ff) { 960 *pDest++=(uint8_t)((c>>6)|0xc0); 961 *pDest++=(uint8_t)((c&0x3f)|0x80); 962 } else if(c<=0xffff) { 963 *pDest++=(uint8_t)((c>>12)|0xe0); 964 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 965 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 966 } else /* if((uint32_t)(c)<=0x10ffff) */ { 967 *pDest++=(uint8_t)(((c)>>18)|0xf0); 968 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 969 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 970 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 971 } 972 return pDest; 973 } 974 975 976 U_CAPI char* U_EXPORT2 977 u_strToUTF8WithSub(char *dest, 978 int32_t destCapacity, 979 int32_t *pDestLength, 980 const UChar *pSrc, 981 int32_t srcLength, 982 UChar32 subchar, int32_t *pNumSubstitutions, 983 UErrorCode *pErrorCode){ 984 int32_t reqLength=0; 985 uint32_t ch=0,ch2=0; 986 uint8_t *pDest = (uint8_t *)dest; 987 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; 988 int32_t numSubstitutions; 989 990 /* args check */ 991 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 992 return NULL; 993 } 994 995 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 996 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 997 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 998 ) { 999 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1000 return NULL; 1001 } 1002 1003 if(pNumSubstitutions!=NULL) { 1004 *pNumSubstitutions=0; 1005 } 1006 numSubstitutions=0; 1007 1008 if(srcLength==-1) { 1009 while((ch=*pSrc)!=0) { 1010 ++pSrc; 1011 if(ch <= 0x7f) { 1012 if(pDest<pDestLimit) { 1013 *pDest++ = (uint8_t)ch; 1014 } else { 1015 reqLength = 1; 1016 break; 1017 } 1018 } else if(ch <= 0x7ff) { 1019 if((pDestLimit - pDest) >= 2) { 1020 *pDest++=(uint8_t)((ch>>6)|0xc0); 1021 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1022 } else { 1023 reqLength = 2; 1024 break; 1025 } 1026 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1027 if((pDestLimit - pDest) >= 3) { 1028 *pDest++=(uint8_t)((ch>>12)|0xe0); 1029 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1030 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1031 } else { 1032 reqLength = 3; 1033 break; 1034 } 1035 } else /* ch is a surrogate */ { 1036 int32_t length; 1037 1038 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ 1039 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1040 ++pSrc; 1041 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1042 } else if(subchar>=0) { 1043 ch=subchar; 1044 ++numSubstitutions; 1045 } else { 1046 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1047 *pErrorCode = U_INVALID_CHAR_FOUND; 1048 return NULL; 1049 } 1050 1051 length = U8_LENGTH(ch); 1052 if((pDestLimit - pDest) >= length) { 1053 /* convert and append*/ 1054 pDest=_appendUTF8(pDest, ch); 1055 } else { 1056 reqLength = length; 1057 break; 1058 } 1059 } 1060 } 1061 while((ch=*pSrc++)!=0) { 1062 if(ch<=0x7f) { 1063 ++reqLength; 1064 } else if(ch<=0x7ff) { 1065 reqLength+=2; 1066 } else if(!U16_IS_SURROGATE(ch)) { 1067 reqLength+=3; 1068 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1069 ++pSrc; 1070 reqLength+=4; 1071 } else if(subchar>=0) { 1072 reqLength+=U8_LENGTH(subchar); 1073 ++numSubstitutions; 1074 } else { 1075 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1076 *pErrorCode = U_INVALID_CHAR_FOUND; 1077 return NULL; 1078 } 1079 } 1080 } else { 1081 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; 1082 int32_t count; 1083 1084 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1085 for(;;) { 1086 /* 1087 * Each iteration of the inner loop progresses by at most 3 UTF-8 1088 * bytes and one UChar, for most characters. 1089 * For supplementary code points (4 & 2), which are rare, 1090 * there is an additional adjustment. 1091 */ 1092 count = (int32_t)((pDestLimit - pDest) / 3); 1093 srcLength = (int32_t)(pSrcLimit - pSrc); 1094 if(count > srcLength) { 1095 count = srcLength; /* min(remaining dest/3, remaining src) */ 1096 } 1097 if(count < 3) { 1098 /* 1099 * Too much overhead if we get near the end of the string, 1100 * continue with the next loop. 1101 */ 1102 break; 1103 } 1104 do { 1105 ch=*pSrc++; 1106 if(ch <= 0x7f) { 1107 *pDest++ = (uint8_t)ch; 1108 } else if(ch <= 0x7ff) { 1109 *pDest++=(uint8_t)((ch>>6)|0xc0); 1110 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1111 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1112 *pDest++=(uint8_t)((ch>>12)|0xe0); 1113 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1114 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1115 } else /* ch is a surrogate */ { 1116 /* 1117 * We will read two UChars and probably output four bytes, 1118 * which we didn't account for with computing count, 1119 * so we adjust it here. 1120 */ 1121 if(--count == 0) { 1122 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1123 break; /* recompute count */ 1124 } 1125 1126 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1127 ++pSrc; 1128 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1129 1130 /* writing 4 bytes per 2 UChars is ok */ 1131 *pDest++=(uint8_t)((ch>>18)|0xf0); 1132 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1133 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1134 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1135 } else { 1136 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1137 if(subchar>=0) { 1138 ch=subchar; 1139 ++numSubstitutions; 1140 } else { 1141 *pErrorCode = U_INVALID_CHAR_FOUND; 1142 return NULL; 1143 } 1144 1145 /* convert and append*/ 1146 pDest=_appendUTF8(pDest, ch); 1147 } 1148 } 1149 } while(--count > 0); 1150 } 1151 1152 while(pSrc<pSrcLimit) { 1153 ch=*pSrc++; 1154 if(ch <= 0x7f) { 1155 if(pDest<pDestLimit) { 1156 *pDest++ = (uint8_t)ch; 1157 } else { 1158 reqLength = 1; 1159 break; 1160 } 1161 } else if(ch <= 0x7ff) { 1162 if((pDestLimit - pDest) >= 2) { 1163 *pDest++=(uint8_t)((ch>>6)|0xc0); 1164 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1165 } else { 1166 reqLength = 2; 1167 break; 1168 } 1169 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1170 if((pDestLimit - pDest) >= 3) { 1171 *pDest++=(uint8_t)((ch>>12)|0xe0); 1172 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1173 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1174 } else { 1175 reqLength = 3; 1176 break; 1177 } 1178 } else /* ch is a surrogate */ { 1179 int32_t length; 1180 1181 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1182 ++pSrc; 1183 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1184 } else if(subchar>=0) { 1185 ch=subchar; 1186 ++numSubstitutions; 1187 } else { 1188 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1189 *pErrorCode = U_INVALID_CHAR_FOUND; 1190 return NULL; 1191 } 1192 1193 length = U8_LENGTH(ch); 1194 if((pDestLimit - pDest) >= length) { 1195 /* convert and append*/ 1196 pDest=_appendUTF8(pDest, ch); 1197 } else { 1198 reqLength = length; 1199 break; 1200 } 1201 } 1202 } 1203 while(pSrc<pSrcLimit) { 1204 ch=*pSrc++; 1205 if(ch<=0x7f) { 1206 ++reqLength; 1207 } else if(ch<=0x7ff) { 1208 reqLength+=2; 1209 } else if(!U16_IS_SURROGATE(ch)) { 1210 reqLength+=3; 1211 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1212 ++pSrc; 1213 reqLength+=4; 1214 } else if(subchar>=0) { 1215 reqLength+=U8_LENGTH(subchar); 1216 ++numSubstitutions; 1217 } else { 1218 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1219 *pErrorCode = U_INVALID_CHAR_FOUND; 1220 return NULL; 1221 } 1222 } 1223 } 1224 1225 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1226 1227 if(pNumSubstitutions!=NULL) { 1228 *pNumSubstitutions=numSubstitutions; 1229 } 1230 1231 if(pDestLength){ 1232 *pDestLength = reqLength; 1233 } 1234 1235 /* Terminate the buffer */ 1236 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1237 return dest; 1238 } 1239 1240 U_CAPI char* U_EXPORT2 1241 u_strToUTF8(char *dest, 1242 int32_t destCapacity, 1243 int32_t *pDestLength, 1244 const UChar *pSrc, 1245 int32_t srcLength, 1246 UErrorCode *pErrorCode){ 1247 return u_strToUTF8WithSub( 1248 dest, destCapacity, pDestLength, 1249 pSrc, srcLength, 1250 U_SENTINEL, NULL, 1251 pErrorCode); 1252 } 1253 1254 U_CAPI UChar* U_EXPORT2 1255 u_strFromJavaModifiedUTF8WithSub( 1256 UChar *dest, 1257 int32_t destCapacity, 1258 int32_t *pDestLength, 1259 const char *src, 1260 int32_t srcLength, 1261 UChar32 subchar, int32_t *pNumSubstitutions, 1262 UErrorCode *pErrorCode) { 1263 UChar *pDest = dest; 1264 UChar *pDestLimit = dest+destCapacity; 1265 UChar32 ch; 1266 int32_t reqLength = 0; 1267 const uint8_t* pSrc = (const uint8_t*) src; 1268 const uint8_t *pSrcLimit; 1269 int32_t count; 1270 uint8_t t1, t2; /* trail bytes */ 1271 int32_t numSubstitutions; 1272 1273 /* args check */ 1274 if(U_FAILURE(*pErrorCode)){ 1275 return NULL; 1276 } 1277 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1278 (dest==NULL && destCapacity!=0) || destCapacity<0 || 1279 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1280 ) { 1281 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1282 return NULL; 1283 } 1284 1285 if(pNumSubstitutions!=NULL) { 1286 *pNumSubstitutions=0; 1287 } 1288 numSubstitutions=0; 1289 1290 if(srcLength < 0) { 1291 /* 1292 * Transform a NUL-terminated ASCII string. 1293 * Handle non-ASCII strings with slower code. 1294 */ 1295 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { 1296 *pDest++=(UChar)ch; 1297 ++pSrc; 1298 } 1299 if(ch == 0) { 1300 reqLength=(int32_t)(pDest - dest); 1301 if(pDestLength) { 1302 *pDestLength = reqLength; 1303 } 1304 1305 /* Terminate the buffer */ 1306 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1307 return dest; 1308 } 1309 srcLength = uprv_strlen((const char *)pSrc); 1310 } 1311 1312 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1313 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; 1314 for(;;) { 1315 count = (int32_t)(pDestLimit - pDest); 1316 srcLength = (int32_t)(pSrcLimit - pSrc); 1317 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { 1318 /* fast ASCII loop */ 1319 const uint8_t *prevSrc = pSrc; 1320 int32_t delta; 1321 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { 1322 *pDest++=(UChar)ch; 1323 ++pSrc; 1324 } 1325 delta = (int32_t)(pSrc - prevSrc); 1326 count -= delta; 1327 srcLength -= delta; 1328 } 1329 /* 1330 * Each iteration of the inner loop progresses by at most 3 UTF-8 1331 * bytes and one UChar. 1332 */ 1333 srcLength /= 3; 1334 if(count > srcLength) { 1335 count = srcLength; /* min(remaining dest, remaining src/3) */ 1336 } 1337 if(count < 3) { 1338 /* 1339 * Too much overhead if we get near the end of the string, 1340 * continue with the next loop. 1341 */ 1342 break; 1343 } 1344 do { 1345 ch = *pSrc; 1346 if(ch <= 0x7f){ 1347 *pDest++=(UChar)ch; 1348 ++pSrc; 1349 } else { 1350 if(ch >= 0xe0) { 1351 if( /* handle U+0000..U+FFFF inline */ 1352 ch <= 0xef && 1353 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1354 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1355 ) { 1356 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1357 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1358 pSrc += 3; 1359 continue; 1360 } 1361 } else { 1362 if( /* handle U+0000..U+07FF inline */ 1363 ch >= 0xc0 && 1364 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1365 ) { 1366 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1367 pSrc += 2; 1368 continue; 1369 } 1370 } 1371 1372 if(subchar < 0) { 1373 *pErrorCode = U_INVALID_CHAR_FOUND; 1374 return NULL; 1375 } else if(subchar > 0xffff && --count == 0) { 1376 /* 1377 * We need to write two UChars, adjusted count for that, 1378 * and ran out of space. 1379 */ 1380 break; 1381 } else { 1382 /* function call for error cases */ 1383 ++pSrc; /* continue after the lead byte */ 1384 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1385 ++numSubstitutions; 1386 if(subchar<=0xFFFF) { 1387 *(pDest++)=(UChar)subchar; 1388 } else { 1389 *(pDest++)=U16_LEAD(subchar); 1390 *(pDest++)=U16_TRAIL(subchar); 1391 } 1392 } 1393 } 1394 } while(--count > 0); 1395 } 1396 1397 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1398 ch = *pSrc; 1399 if(ch <= 0x7f){ 1400 *pDest++=(UChar)ch; 1401 ++pSrc; 1402 } else { 1403 if(ch >= 0xe0) { 1404 if( /* handle U+0000..U+FFFF inline */ 1405 ch <= 0xef && 1406 ((pSrcLimit - pSrc) >= 3) && 1407 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1408 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1409 ) { 1410 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1411 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1412 pSrc += 3; 1413 continue; 1414 } 1415 } else { 1416 if( /* handle U+0000..U+07FF inline */ 1417 ch >= 0xc0 && 1418 ((pSrcLimit - pSrc) >= 2) && 1419 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1420 ) { 1421 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1422 pSrc += 2; 1423 continue; 1424 } 1425 } 1426 1427 if(subchar < 0) { 1428 *pErrorCode = U_INVALID_CHAR_FOUND; 1429 return NULL; 1430 } else { 1431 /* function call for error cases */ 1432 ++pSrc; /* continue after the lead byte */ 1433 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1434 ++numSubstitutions; 1435 if(subchar<=0xFFFF) { 1436 *(pDest++)=(UChar)subchar; 1437 } else { 1438 *(pDest++)=U16_LEAD(subchar); 1439 if(pDest<pDestLimit) { 1440 *(pDest++)=U16_TRAIL(subchar); 1441 } else { 1442 reqLength++; 1443 break; 1444 } 1445 } 1446 } 1447 } 1448 } 1449 1450 /* do not fill the dest buffer just count the UChars needed */ 1451 while(pSrc < pSrcLimit){ 1452 ch = *pSrc; 1453 if(ch <= 0x7f) { 1454 reqLength++; 1455 ++pSrc; 1456 } else { 1457 if(ch >= 0xe0) { 1458 if( /* handle U+0000..U+FFFF inline */ 1459 ch <= 0xef && 1460 ((pSrcLimit - pSrc) >= 3) && 1461 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1462 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1463 ) { 1464 reqLength++; 1465 pSrc += 3; 1466 continue; 1467 } 1468 } else { 1469 if( /* handle U+0000..U+07FF inline */ 1470 ch >= 0xc0 && 1471 ((pSrcLimit - pSrc) >= 2) && 1472 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1473 ) { 1474 reqLength++; 1475 pSrc += 2; 1476 continue; 1477 } 1478 } 1479 1480 if(subchar < 0) { 1481 *pErrorCode = U_INVALID_CHAR_FOUND; 1482 return NULL; 1483 } else { 1484 /* function call for error cases */ 1485 ++pSrc; /* continue after the lead byte */ 1486 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1487 ++numSubstitutions; 1488 reqLength+=U16_LENGTH(ch); 1489 } 1490 } 1491 } 1492 1493 if(pNumSubstitutions!=NULL) { 1494 *pNumSubstitutions=numSubstitutions; 1495 } 1496 1497 reqLength+=(int32_t)(pDest - dest); 1498 if(pDestLength) { 1499 *pDestLength = reqLength; 1500 } 1501 1502 /* Terminate the buffer */ 1503 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1504 return dest; 1505 } 1506 1507 U_CAPI char* U_EXPORT2 1508 u_strToJavaModifiedUTF8( 1509 char *dest, 1510 int32_t destCapacity, 1511 int32_t *pDestLength, 1512 const UChar *src, 1513 int32_t srcLength, 1514 UErrorCode *pErrorCode) { 1515 int32_t reqLength=0; 1516 uint32_t ch=0; 1517 uint8_t *pDest = (uint8_t *)dest; 1518 uint8_t *pDestLimit = pDest + destCapacity; 1519 const UChar *pSrcLimit; 1520 int32_t count; 1521 1522 /* args check */ 1523 if(U_FAILURE(*pErrorCode)){ 1524 return NULL; 1525 } 1526 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1527 (dest==NULL && destCapacity!=0) || destCapacity<0 1528 ) { 1529 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1530 return NULL; 1531 } 1532 1533 if(srcLength==-1) { 1534 /* Convert NUL-terminated ASCII, then find the string length. */ 1535 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1536 *pDest++ = (uint8_t)ch; 1537 ++src; 1538 } 1539 if(ch == 0) { 1540 reqLength=(int32_t)(pDest - (uint8_t *)dest); 1541 if(pDestLength) { 1542 *pDestLength = reqLength; 1543 } 1544 1545 /* Terminate the buffer */ 1546 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1547 return dest; 1548 } 1549 srcLength = u_strlen(src); 1550 } 1551 1552 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1553 pSrcLimit = (src!=NULL)?(src+srcLength):NULL; 1554 for(;;) { 1555 count = (int32_t)(pDestLimit - pDest); 1556 srcLength = (int32_t)(pSrcLimit - src); 1557 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1558 /* fast ASCII loop */ 1559 const UChar *prevSrc = src; 1560 int32_t delta; 1561 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1562 *pDest++=(uint8_t)ch; 1563 ++src; 1564 } 1565 delta = (int32_t)(src - prevSrc); 1566 count -= delta; 1567 srcLength -= delta; 1568 } 1569 /* 1570 * Each iteration of the inner loop progresses by at most 3 UTF-8 1571 * bytes and one UChar. 1572 */ 1573 count /= 3; 1574 if(count > srcLength) { 1575 count = srcLength; /* min(remaining dest/3, remaining src) */ 1576 } 1577 if(count < 3) { 1578 /* 1579 * Too much overhead if we get near the end of the string, 1580 * continue with the next loop. 1581 */ 1582 break; 1583 } 1584 do { 1585 ch=*src++; 1586 if(ch <= 0x7f && ch != 0) { 1587 *pDest++ = (uint8_t)ch; 1588 } else if(ch <= 0x7ff) { 1589 *pDest++=(uint8_t)((ch>>6)|0xc0); 1590 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1591 } else { 1592 *pDest++=(uint8_t)((ch>>12)|0xe0); 1593 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1594 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1595 } 1596 } while(--count > 0); 1597 } 1598 1599 while(src<pSrcLimit) { 1600 ch=*src++; 1601 if(ch <= 0x7f && ch != 0) { 1602 if(pDest<pDestLimit) { 1603 *pDest++ = (uint8_t)ch; 1604 } else { 1605 reqLength = 1; 1606 break; 1607 } 1608 } else if(ch <= 0x7ff) { 1609 if((pDestLimit - pDest) >= 2) { 1610 *pDest++=(uint8_t)((ch>>6)|0xc0); 1611 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1612 } else { 1613 reqLength = 2; 1614 break; 1615 } 1616 } else { 1617 if((pDestLimit - pDest) >= 3) { 1618 *pDest++=(uint8_t)((ch>>12)|0xe0); 1619 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1620 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1621 } else { 1622 reqLength = 3; 1623 break; 1624 } 1625 } 1626 } 1627 while(src<pSrcLimit) { 1628 ch=*src++; 1629 if(ch <= 0x7f && ch != 0) { 1630 ++reqLength; 1631 } else if(ch<=0x7ff) { 1632 reqLength+=2; 1633 } else { 1634 reqLength+=3; 1635 } 1636 } 1637 1638 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1639 if(pDestLength){ 1640 *pDestLength = reqLength; 1641 } 1642 1643 /* Terminate the buffer */ 1644 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1645 return dest; 1646 } 1647