1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 2001-2013, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * 9 * File ustrtrns.cpp 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 9/10/2001 Ram Creation. 15 ****************************************************************************** 16 */ 17 18 /******************************************************************************* 19 * 20 * u_strTo* and u_strFrom* APIs 21 * WCS functions moved to ustr_wcs.c for better modularization 22 * 23 ******************************************************************************* 24 */ 25 26 27 #include "unicode/putil.h" 28 #include "unicode/ustring.h" 29 #include "unicode/utf.h" 30 #include "unicode/utf8.h" 31 #include "unicode/utf16.h" 32 #include "cstring.h" 33 #include "cmemory.h" 34 #include "ustr_imp.h" 35 #include "uassert.h" 36 37 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 38 39 U_CAPI UChar* U_EXPORT2 40 u_strFromUTF32WithSub(UChar *dest, 41 int32_t destCapacity, 42 int32_t *pDestLength, 43 const UChar32 *src, 44 int32_t srcLength, 45 UChar32 subchar, int32_t *pNumSubstitutions, 46 UErrorCode *pErrorCode) { 47 const UChar32 *srcLimit; 48 UChar32 ch; 49 UChar *destLimit; 50 UChar *pDest; 51 int32_t reqLength; 52 int32_t numSubstitutions; 53 54 /* args check */ 55 if(U_FAILURE(*pErrorCode)){ 56 return NULL; 57 } 58 if( (src==NULL && srcLength!=0) || srcLength < -1 || 59 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 60 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 61 ) { 62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 63 return NULL; 64 } 65 66 if(pNumSubstitutions != NULL) { 67 *pNumSubstitutions = 0; 68 } 69 70 pDest = dest; 71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 72 reqLength = 0; 73 numSubstitutions = 0; 74 75 if(srcLength < 0) { 76 /* simple loop for conversion of a NUL-terminated BMP string */ 77 while((ch=*src) != 0 && 78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 79 ++src; 80 if(pDest < destLimit) { 81 *pDest++ = (UChar)ch; 82 } else { 83 ++reqLength; 84 } 85 } 86 srcLimit = src; 87 if(ch != 0) { 88 /* "complicated" case, find the end of the remaining string */ 89 while(*++srcLimit != 0) {} 90 } 91 } else { 92 srcLimit = (src!=NULL)?(src + srcLength):NULL; 93 } 94 95 /* convert with length */ 96 while(src < srcLimit) { 97 ch = *src++; 98 do { 99 /* usually "loops" once; twice only for writing subchar */ 100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 101 if(pDest < destLimit) { 102 *pDest++ = (UChar)ch; 103 } else { 104 ++reqLength; 105 } 106 break; 107 } else if(0x10000 <= ch && ch <= 0x10ffff) { 108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) { 109 *pDest++ = U16_LEAD(ch); 110 *pDest++ = U16_TRAIL(ch); 111 } else { 112 reqLength += 2; 113 } 114 break; 115 } else if((ch = subchar) < 0) { 116 /* surrogate code point, or not a Unicode code point at all */ 117 *pErrorCode = U_INVALID_CHAR_FOUND; 118 return NULL; 119 } else { 120 ++numSubstitutions; 121 } 122 } while(TRUE); 123 } 124 125 reqLength += (int32_t)(pDest - dest); 126 if(pDestLength) { 127 *pDestLength = reqLength; 128 } 129 if(pNumSubstitutions != NULL) { 130 *pNumSubstitutions = numSubstitutions; 131 } 132 133 /* Terminate the buffer */ 134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 135 136 return dest; 137 } 138 139 U_CAPI UChar* U_EXPORT2 140 u_strFromUTF32(UChar *dest, 141 int32_t destCapacity, 142 int32_t *pDestLength, 143 const UChar32 *src, 144 int32_t srcLength, 145 UErrorCode *pErrorCode) { 146 return u_strFromUTF32WithSub( 147 dest, destCapacity, pDestLength, 148 src, srcLength, 149 U_SENTINEL, NULL, 150 pErrorCode); 151 } 152 153 U_CAPI UChar32* U_EXPORT2 154 u_strToUTF32WithSub(UChar32 *dest, 155 int32_t destCapacity, 156 int32_t *pDestLength, 157 const UChar *src, 158 int32_t srcLength, 159 UChar32 subchar, int32_t *pNumSubstitutions, 160 UErrorCode *pErrorCode) { 161 const UChar *srcLimit; 162 UChar32 ch; 163 UChar ch2; 164 UChar32 *destLimit; 165 UChar32 *pDest; 166 int32_t reqLength; 167 int32_t numSubstitutions; 168 169 /* args check */ 170 if(U_FAILURE(*pErrorCode)){ 171 return NULL; 172 } 173 if( (src==NULL && srcLength!=0) || srcLength < -1 || 174 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 175 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 176 ) { 177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 178 return NULL; 179 } 180 181 if(pNumSubstitutions != NULL) { 182 *pNumSubstitutions = 0; 183 } 184 185 pDest = dest; 186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 187 reqLength = 0; 188 numSubstitutions = 0; 189 190 if(srcLength < 0) { 191 /* simple loop for conversion of a NUL-terminated BMP string */ 192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 193 ++src; 194 if(pDest < destLimit) { 195 *pDest++ = ch; 196 } else { 197 ++reqLength; 198 } 199 } 200 srcLimit = src; 201 if(ch != 0) { 202 /* "complicated" case, find the end of the remaining string */ 203 while(*++srcLimit != 0) {} 204 } 205 } else { 206 srcLimit = (src!=NULL)?(src + srcLength):NULL; 207 } 208 209 /* convert with length */ 210 while(src < srcLimit) { 211 ch = *src++; 212 if(!U16_IS_SURROGATE(ch)) { 213 /* write or count ch below */ 214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 215 ++src; 216 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 217 } else if((ch = subchar) < 0) { 218 /* unpaired surrogate */ 219 *pErrorCode = U_INVALID_CHAR_FOUND; 220 return NULL; 221 } else { 222 ++numSubstitutions; 223 } 224 if(pDest < destLimit) { 225 *pDest++ = ch; 226 } else { 227 ++reqLength; 228 } 229 } 230 231 reqLength += (int32_t)(pDest - dest); 232 if(pDestLength) { 233 *pDestLength = reqLength; 234 } 235 if(pNumSubstitutions != NULL) { 236 *pNumSubstitutions = numSubstitutions; 237 } 238 239 /* Terminate the buffer */ 240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 241 242 return dest; 243 } 244 245 U_CAPI UChar32* U_EXPORT2 246 u_strToUTF32(UChar32 *dest, 247 int32_t destCapacity, 248 int32_t *pDestLength, 249 const UChar *src, 250 int32_t srcLength, 251 UErrorCode *pErrorCode) { 252 return u_strToUTF32WithSub( 253 dest, destCapacity, pDestLength, 254 src, srcLength, 255 U_SENTINEL, NULL, 256 pErrorCode); 257 } 258 259 /* for utf8_nextCharSafeBodyTerminated() */ 260 static const UChar32 261 utf8_minLegal[4]={ 0, 0x80, 0x800, 0x10000 }; 262 263 /* 264 * Version of utf8_nextCharSafeBody() with the following differences: 265 * - checks for NUL termination instead of length 266 * - works with pointers instead of indexes 267 * - always strict (strict==-1) 268 * 269 * *ps points to after the lead byte and will be moved to after the last trail byte. 270 * c is the lead byte. 271 * @return the code point, or U_SENTINEL 272 */ 273 static UChar32 274 utf8_nextCharSafeBodyTerminated(const uint8_t **ps, UChar32 c) { 275 const uint8_t *s=*ps; 276 uint8_t trail, illegal=0; 277 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 278 U_ASSERT(count<6); 279 U8_MASK_LEAD_BYTE((c), count); 280 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 281 switch(count) { 282 /* each branch falls through to the next one */ 283 case 5: 284 case 4: 285 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 286 illegal=1; 287 break; 288 case 3: 289 trail=(uint8_t)(*s++ - 0x80); 290 c=(c<<6)|trail; 291 if(trail>0x3f || c>=0x110) { 292 /* not a trail byte, or code point>0x10ffff (outside Unicode) */ 293 illegal=1; 294 break; 295 } 296 case 2: /*fall through*/ 297 trail=(uint8_t)(*s++ - 0x80); 298 if(trail>0x3f) { 299 /* not a trail byte */ 300 illegal=1; 301 break; 302 } 303 c=(c<<6)|trail; 304 case 1: /*fall through*/ 305 trail=(uint8_t)(*s++ - 0x80); 306 if(trail>0x3f) { 307 /* not a trail byte */ 308 illegal=1; 309 } 310 c=(c<<6)|trail; 311 break; 312 case 0: 313 return U_SENTINEL; 314 /* no default branch to optimize switch() - all values are covered */ 315 } 316 317 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 318 /* illegal is also set if count>=4 */ 319 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 320 /* error handling */ 321 /* don't go beyond this sequence */ 322 s=*ps; 323 while(count>0 && U8_IS_TRAIL(*s)) { 324 ++s; 325 --count; 326 } 327 c=U_SENTINEL; 328 } 329 *ps=s; 330 return c; 331 } 332 333 /* 334 * Version of utf8_nextCharSafeBody() with the following differences: 335 * - works with pointers instead of indexes 336 * - always strict (strict==-1) 337 * 338 * *ps points to after the lead byte and will be moved to after the last trail byte. 339 * c is the lead byte. 340 * @return the code point, or U_SENTINEL 341 */ 342 static UChar32 343 utf8_nextCharSafeBodyPointer(const uint8_t **ps, const uint8_t *limit, UChar32 c) { 344 const uint8_t *s=*ps; 345 uint8_t trail, illegal=0; 346 uint8_t count=U8_COUNT_TRAIL_BYTES(c); 347 if((limit-s)>=count) { 348 U8_MASK_LEAD_BYTE((c), count); 349 /* count==0 for illegally leading trail bytes and the illegal bytes 0xfe and 0xff */ 350 switch(count) { 351 /* each branch falls through to the next one */ 352 case 5: 353 case 4: 354 /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ 355 illegal=1; 356 break; 357 case 3: 358 trail=*s++; 359 c=(c<<6)|(trail&0x3f); 360 if(c<0x110) { 361 illegal|=(trail&0xc0)^0x80; 362 } else { 363 /* code point>0x10ffff, outside Unicode */ 364 illegal=1; 365 break; 366 } 367 case 2: /*fall through*/ 368 trail=*s++; 369 c=(c<<6)|(trail&0x3f); 370 illegal|=(trail&0xc0)^0x80; 371 case 1: /*fall through*/ 372 trail=*s++; 373 c=(c<<6)|(trail&0x3f); 374 illegal|=(trail&0xc0)^0x80; 375 break; 376 case 0: 377 return U_SENTINEL; 378 /* no default branch to optimize switch() - all values are covered */ 379 } 380 } else { 381 illegal=1; /* too few bytes left */ 382 } 383 384 /* correct sequence - all trail bytes have (b7..b6)==(10)? */ 385 /* illegal is also set if count>=4 */ 386 U_ASSERT(illegal || count<LENGTHOF(utf8_minLegal)); 387 if(illegal || c<utf8_minLegal[count] || U_IS_SURROGATE(c)) { 388 /* error handling */ 389 /* don't go beyond this sequence */ 390 s=*ps; 391 while(count>0 && s<limit && U8_IS_TRAIL(*s)) { 392 ++s; 393 --count; 394 } 395 c=U_SENTINEL; 396 } 397 *ps=s; 398 return c; 399 } 400 401 U_CAPI UChar* U_EXPORT2 402 u_strFromUTF8WithSub(UChar *dest, 403 int32_t destCapacity, 404 int32_t *pDestLength, 405 const char* src, 406 int32_t srcLength, 407 UChar32 subchar, int32_t *pNumSubstitutions, 408 UErrorCode *pErrorCode){ 409 UChar *pDest = dest; 410 UChar *pDestLimit = dest+destCapacity; 411 UChar32 ch; 412 int32_t reqLength = 0; 413 const uint8_t* pSrc = (const uint8_t*) src; 414 uint8_t t1, t2; /* trail bytes */ 415 int32_t numSubstitutions; 416 417 /* args check */ 418 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 419 return NULL; 420 } 421 422 if( (src==NULL && srcLength!=0) || srcLength < -1 || 423 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 424 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 425 ) { 426 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 427 return NULL; 428 } 429 430 if(pNumSubstitutions!=NULL) { 431 *pNumSubstitutions=0; 432 } 433 numSubstitutions=0; 434 435 /* 436 * Inline processing of UTF-8 byte sequences: 437 * 438 * Byte sequences for the most common characters are handled inline in 439 * the conversion loops. In order to reduce the path lengths for those 440 * characters, the tests are arranged in a kind of binary search. 441 * ASCII (<=0x7f) is checked first, followed by the dividing point 442 * between 2- and 3-byte sequences (0xe0). 443 * The 3-byte branch is tested first to speed up CJK text. 444 * The compiler should combine the subtractions for the two tests for 0xe0. 445 * Each branch then tests for the other end of its range. 446 */ 447 448 if(srcLength < 0){ 449 /* 450 * Transform a NUL-terminated string. 451 * The code explicitly checks for NULs only in the lead byte position. 452 * A NUL byte in the trail byte position fails the trail byte range check anyway. 453 */ 454 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 455 if(ch <= 0x7f){ 456 *pDest++=(UChar)ch; 457 ++pSrc; 458 } else { 459 if(ch > 0xe0) { 460 if( /* handle U+1000..U+CFFF inline */ 461 ch <= 0xec && 462 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 463 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 464 ) { 465 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 466 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 467 pSrc += 3; 468 continue; 469 } 470 } else if(ch < 0xe0) { 471 if( /* handle U+0080..U+07FF inline */ 472 ch >= 0xc2 && 473 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 474 ) { 475 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 476 pSrc += 2; 477 continue; 478 } 479 } 480 481 /* function call for "complicated" and error cases */ 482 ++pSrc; /* continue after the lead byte */ 483 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 484 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 485 *pErrorCode = U_INVALID_CHAR_FOUND; 486 return NULL; 487 } else if(ch<=0xFFFF) { 488 *(pDest++)=(UChar)ch; 489 } else { 490 *(pDest++)=U16_LEAD(ch); 491 if(pDest<pDestLimit) { 492 *(pDest++)=U16_TRAIL(ch); 493 } else { 494 reqLength++; 495 break; 496 } 497 } 498 } 499 } 500 501 /* Pre-flight the rest of the string. */ 502 while((ch = *pSrc) != 0) { 503 if(ch <= 0x7f){ 504 ++reqLength; 505 ++pSrc; 506 } else { 507 if(ch > 0xe0) { 508 if( /* handle U+1000..U+CFFF inline */ 509 ch <= 0xec && 510 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 511 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 512 ) { 513 ++reqLength; 514 pSrc += 3; 515 continue; 516 } 517 } else if(ch < 0xe0) { 518 if( /* handle U+0080..U+07FF inline */ 519 ch >= 0xc2 && 520 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 521 ) { 522 ++reqLength; 523 pSrc += 2; 524 continue; 525 } 526 } 527 528 /* function call for "complicated" and error cases */ 529 ++pSrc; /* continue after the lead byte */ 530 ch=utf8_nextCharSafeBodyTerminated(&pSrc, ch); 531 if(ch<0 && (++numSubstitutions, ch = subchar) < 0) { 532 *pErrorCode = U_INVALID_CHAR_FOUND; 533 return NULL; 534 } 535 reqLength += U16_LENGTH(ch); 536 } 537 } 538 } else /* srcLength >= 0 */ { 539 const uint8_t *pSrcLimit = pSrc + srcLength; 540 int32_t count; 541 542 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 543 for(;;) { 544 /* 545 * Each iteration of the inner loop progresses by at most 3 UTF-8 546 * bytes and one UChar, for most characters. 547 * For supplementary code points (4 & 2), which are rare, 548 * there is an additional adjustment. 549 */ 550 count = (int32_t)(pDestLimit - pDest); 551 srcLength = (int32_t)((pSrcLimit - pSrc) / 3); 552 if(count > srcLength) { 553 count = srcLength; /* min(remaining dest, remaining src/3) */ 554 } 555 if(count < 3) { 556 /* 557 * Too much overhead if we get near the end of the string, 558 * continue with the next loop. 559 */ 560 break; 561 } 562 563 do { 564 ch = *pSrc; 565 if(ch <= 0x7f){ 566 *pDest++=(UChar)ch; 567 ++pSrc; 568 } else { 569 if(ch > 0xe0) { 570 if( /* handle U+1000..U+CFFF inline */ 571 ch <= 0xec && 572 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 573 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 574 ) { 575 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 576 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 577 pSrc += 3; 578 continue; 579 } 580 } else if(ch < 0xe0) { 581 if( /* handle U+0080..U+07FF inline */ 582 ch >= 0xc2 && 583 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 584 ) { 585 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 586 pSrc += 2; 587 continue; 588 } 589 } 590 591 if(ch >= 0xf0 || subchar > 0xffff) { 592 /* 593 * We may read up to six bytes and write up to two UChars, 594 * which we didn't account for with computing count, 595 * so we adjust it here. 596 */ 597 if(--count == 0) { 598 break; 599 } 600 } 601 602 /* function call for "complicated" and error cases */ 603 ++pSrc; /* continue after the lead byte */ 604 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 605 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 606 *pErrorCode = U_INVALID_CHAR_FOUND; 607 return NULL; 608 }else if(ch<=0xFFFF){ 609 *(pDest++)=(UChar)ch; 610 }else{ 611 *(pDest++)=U16_LEAD(ch); 612 *(pDest++)=U16_TRAIL(ch); 613 } 614 } 615 } while(--count > 0); 616 } 617 618 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 619 ch = *pSrc; 620 if(ch <= 0x7f){ 621 *pDest++=(UChar)ch; 622 ++pSrc; 623 } else { 624 if(ch > 0xe0) { 625 if( /* handle U+1000..U+CFFF inline */ 626 ch <= 0xec && 627 ((pSrcLimit - pSrc) >= 3) && 628 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 629 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 630 ) { 631 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 632 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 633 pSrc += 3; 634 continue; 635 } 636 } else if(ch < 0xe0) { 637 if( /* handle U+0080..U+07FF inline */ 638 ch >= 0xc2 && 639 ((pSrcLimit - pSrc) >= 2) && 640 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 641 ) { 642 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 643 pSrc += 2; 644 continue; 645 } 646 } 647 648 /* function call for "complicated" and error cases */ 649 ++pSrc; /* continue after the lead byte */ 650 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 651 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 652 *pErrorCode = U_INVALID_CHAR_FOUND; 653 return NULL; 654 }else if(ch<=0xFFFF){ 655 *(pDest++)=(UChar)ch; 656 }else{ 657 *(pDest++)=U16_LEAD(ch); 658 if(pDest<pDestLimit){ 659 *(pDest++)=U16_TRAIL(ch); 660 }else{ 661 reqLength++; 662 break; 663 } 664 } 665 } 666 } 667 /* do not fill the dest buffer just count the UChars needed */ 668 while(pSrc < pSrcLimit){ 669 ch = *pSrc; 670 if(ch <= 0x7f){ 671 reqLength++; 672 ++pSrc; 673 } else { 674 if(ch > 0xe0) { 675 if( /* handle U+1000..U+CFFF inline */ 676 ch <= 0xec && 677 ((pSrcLimit - pSrc) >= 3) && 678 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 679 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 680 ) { 681 reqLength++; 682 pSrc += 3; 683 continue; 684 } 685 } else if(ch < 0xe0) { 686 if( /* handle U+0080..U+07FF inline */ 687 ch >= 0xc2 && 688 ((pSrcLimit - pSrc) >= 2) && 689 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 690 ) { 691 reqLength++; 692 pSrc += 2; 693 continue; 694 } 695 } 696 697 /* function call for "complicated" and error cases */ 698 ++pSrc; /* continue after the lead byte */ 699 ch=utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 700 if(ch<0 && (++numSubstitutions, ch = subchar) < 0){ 701 *pErrorCode = U_INVALID_CHAR_FOUND; 702 return NULL; 703 } 704 reqLength+=U16_LENGTH(ch); 705 } 706 } 707 } 708 709 reqLength+=(int32_t)(pDest - dest); 710 711 if(pNumSubstitutions!=NULL) { 712 *pNumSubstitutions=numSubstitutions; 713 } 714 715 if(pDestLength){ 716 *pDestLength = reqLength; 717 } 718 719 /* Terminate the buffer */ 720 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 721 722 return dest; 723 } 724 725 U_CAPI UChar* U_EXPORT2 726 u_strFromUTF8(UChar *dest, 727 int32_t destCapacity, 728 int32_t *pDestLength, 729 const char* src, 730 int32_t srcLength, 731 UErrorCode *pErrorCode){ 732 return u_strFromUTF8WithSub( 733 dest, destCapacity, pDestLength, 734 src, srcLength, 735 U_SENTINEL, NULL, 736 pErrorCode); 737 } 738 739 U_CAPI UChar * U_EXPORT2 740 u_strFromUTF8Lenient(UChar *dest, 741 int32_t destCapacity, 742 int32_t *pDestLength, 743 const char *src, 744 int32_t srcLength, 745 UErrorCode *pErrorCode) { 746 UChar *pDest = dest; 747 UChar32 ch; 748 int32_t reqLength = 0; 749 uint8_t* pSrc = (uint8_t*) src; 750 751 /* args check */ 752 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 753 return NULL; 754 } 755 756 if( (src==NULL && srcLength!=0) || srcLength < -1 || 757 (destCapacity<0) || (dest == NULL && destCapacity > 0) 758 ) { 759 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 760 return NULL; 761 } 762 763 if(srcLength < 0) { 764 /* Transform a NUL-terminated string. */ 765 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 766 uint8_t t1, t2, t3; /* trail bytes */ 767 768 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 769 if(ch < 0xc0) { 770 /* 771 * ASCII, or a trail byte in lead position which is treated like 772 * a single-byte sequence for better character boundary 773 * resynchronization after illegal sequences. 774 */ 775 *pDest++=(UChar)ch; 776 ++pSrc; 777 continue; 778 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 779 if((t1 = pSrc[1]) != 0) { 780 /* 0x3080 = (0xc0 << 6) + 0x80 */ 781 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 782 pSrc += 2; 783 continue; 784 } 785 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 786 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 787 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 788 /* 0x2080 = (0x80 << 6) + 0x80 */ 789 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 790 pSrc += 3; 791 continue; 792 } 793 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 794 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 795 pSrc += 4; 796 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 797 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 798 *(pDest++) = U16_LEAD(ch); 799 if(pDest < pDestLimit) { 800 *(pDest++) = U16_TRAIL(ch); 801 } else { 802 reqLength = 1; 803 break; 804 } 805 continue; 806 } 807 } 808 809 /* truncated character at the end */ 810 *pDest++ = 0xfffd; 811 while(*++pSrc != 0) {} 812 break; 813 } 814 815 /* Pre-flight the rest of the string. */ 816 while((ch = *pSrc) != 0) { 817 if(ch < 0xc0) { 818 /* 819 * ASCII, or a trail byte in lead position which is treated like 820 * a single-byte sequence for better character boundary 821 * resynchronization after illegal sequences. 822 */ 823 ++reqLength; 824 ++pSrc; 825 continue; 826 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 827 if(pSrc[1] != 0) { 828 ++reqLength; 829 pSrc += 2; 830 continue; 831 } 832 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 833 if(pSrc[1] != 0 && pSrc[2] != 0) { 834 ++reqLength; 835 pSrc += 3; 836 continue; 837 } 838 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 839 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 840 reqLength += 2; 841 pSrc += 4; 842 continue; 843 } 844 } 845 846 /* truncated character at the end */ 847 ++reqLength; 848 break; 849 } 850 } else /* srcLength >= 0 */ { 851 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; 852 853 /* 854 * This function requires that if srcLength is given, then it must be 855 * destCapatity >= srcLength so that we need not check for 856 * destination buffer overflow in the loop. 857 */ 858 if(destCapacity < srcLength) { 859 if(pDestLength != NULL) { 860 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 861 } 862 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 863 return NULL; 864 } 865 866 if((pSrcLimit - pSrc) >= 4) { 867 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 868 869 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 870 do { 871 ch = *pSrc++; 872 if(ch < 0xc0) { 873 /* 874 * ASCII, or a trail byte in lead position which is treated like 875 * a single-byte sequence for better character boundary 876 * resynchronization after illegal sequences. 877 */ 878 *pDest++=(UChar)ch; 879 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 880 /* 0x3080 = (0xc0 << 6) + 0x80 */ 881 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 882 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 883 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 884 /* 0x2080 = (0x80 << 6) + 0x80 */ 885 ch = (ch << 12) + (*pSrc++ << 6); 886 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 887 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 888 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 889 ch = (ch << 18) + (*pSrc++ << 12); 890 ch += *pSrc++ << 6; 891 ch += *pSrc++ - 0x3c82080; 892 *(pDest++) = U16_LEAD(ch); 893 *(pDest++) = U16_TRAIL(ch); 894 } 895 } while(pSrc < pSrcLimit); 896 897 pSrcLimit += 3; /* restore original pSrcLimit */ 898 } 899 900 while(pSrc < pSrcLimit) { 901 ch = *pSrc++; 902 if(ch < 0xc0) { 903 /* 904 * ASCII, or a trail byte in lead position which is treated like 905 * a single-byte sequence for better character boundary 906 * resynchronization after illegal sequences. 907 */ 908 *pDest++=(UChar)ch; 909 continue; 910 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 911 if(pSrc < pSrcLimit) { 912 /* 0x3080 = (0xc0 << 6) + 0x80 */ 913 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 914 continue; 915 } 916 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 917 if((pSrcLimit - pSrc) >= 2) { 918 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 919 /* 0x2080 = (0x80 << 6) + 0x80 */ 920 ch = (ch << 12) + (*pSrc++ << 6); 921 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 922 pSrc += 3; 923 continue; 924 } 925 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 926 if((pSrcLimit - pSrc) >= 3) { 927 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 928 ch = (ch << 18) + (*pSrc++ << 12); 929 ch += *pSrc++ << 6; 930 ch += *pSrc++ - 0x3c82080; 931 *(pDest++) = U16_LEAD(ch); 932 *(pDest++) = U16_TRAIL(ch); 933 pSrc += 4; 934 continue; 935 } 936 } 937 938 /* truncated character at the end */ 939 *pDest++ = 0xfffd; 940 break; 941 } 942 } 943 944 reqLength+=(int32_t)(pDest - dest); 945 946 if(pDestLength){ 947 *pDestLength = reqLength; 948 } 949 950 /* Terminate the buffer */ 951 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 952 953 return dest; 954 } 955 956 static inline uint8_t * 957 _appendUTF8(uint8_t *pDest, UChar32 c) { 958 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 959 if((c)<=0x7f) { 960 *pDest++=(uint8_t)c; 961 } else if(c<=0x7ff) { 962 *pDest++=(uint8_t)((c>>6)|0xc0); 963 *pDest++=(uint8_t)((c&0x3f)|0x80); 964 } else if(c<=0xffff) { 965 *pDest++=(uint8_t)((c>>12)|0xe0); 966 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 967 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 968 } else /* if((uint32_t)(c)<=0x10ffff) */ { 969 *pDest++=(uint8_t)(((c)>>18)|0xf0); 970 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 971 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 972 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 973 } 974 return pDest; 975 } 976 977 978 U_CAPI char* U_EXPORT2 979 u_strToUTF8WithSub(char *dest, 980 int32_t destCapacity, 981 int32_t *pDestLength, 982 const UChar *pSrc, 983 int32_t srcLength, 984 UChar32 subchar, int32_t *pNumSubstitutions, 985 UErrorCode *pErrorCode){ 986 int32_t reqLength=0; 987 uint32_t ch=0,ch2=0; 988 uint8_t *pDest = (uint8_t *)dest; 989 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; 990 int32_t numSubstitutions; 991 992 /* args check */ 993 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)){ 994 return NULL; 995 } 996 997 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 998 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 999 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1000 ) { 1001 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1002 return NULL; 1003 } 1004 1005 if(pNumSubstitutions!=NULL) { 1006 *pNumSubstitutions=0; 1007 } 1008 numSubstitutions=0; 1009 1010 if(srcLength==-1) { 1011 while((ch=*pSrc)!=0) { 1012 ++pSrc; 1013 if(ch <= 0x7f) { 1014 if(pDest<pDestLimit) { 1015 *pDest++ = (uint8_t)ch; 1016 } else { 1017 reqLength = 1; 1018 break; 1019 } 1020 } else if(ch <= 0x7ff) { 1021 if((pDestLimit - pDest) >= 2) { 1022 *pDest++=(uint8_t)((ch>>6)|0xc0); 1023 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1024 } else { 1025 reqLength = 2; 1026 break; 1027 } 1028 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1029 if((pDestLimit - pDest) >= 3) { 1030 *pDest++=(uint8_t)((ch>>12)|0xe0); 1031 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1032 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1033 } else { 1034 reqLength = 3; 1035 break; 1036 } 1037 } else /* ch is a surrogate */ { 1038 int32_t length; 1039 1040 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ 1041 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1042 ++pSrc; 1043 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1044 } else if(subchar>=0) { 1045 ch=subchar; 1046 ++numSubstitutions; 1047 } else { 1048 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1049 *pErrorCode = U_INVALID_CHAR_FOUND; 1050 return NULL; 1051 } 1052 1053 length = U8_LENGTH(ch); 1054 if((pDestLimit - pDest) >= length) { 1055 /* convert and append*/ 1056 pDest=_appendUTF8(pDest, ch); 1057 } else { 1058 reqLength = length; 1059 break; 1060 } 1061 } 1062 } 1063 while((ch=*pSrc++)!=0) { 1064 if(ch<=0x7f) { 1065 ++reqLength; 1066 } else if(ch<=0x7ff) { 1067 reqLength+=2; 1068 } else if(!U16_IS_SURROGATE(ch)) { 1069 reqLength+=3; 1070 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1071 ++pSrc; 1072 reqLength+=4; 1073 } else if(subchar>=0) { 1074 reqLength+=U8_LENGTH(subchar); 1075 ++numSubstitutions; 1076 } else { 1077 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1078 *pErrorCode = U_INVALID_CHAR_FOUND; 1079 return NULL; 1080 } 1081 } 1082 } else { 1083 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; 1084 int32_t count; 1085 1086 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1087 for(;;) { 1088 /* 1089 * Each iteration of the inner loop progresses by at most 3 UTF-8 1090 * bytes and one UChar, for most characters. 1091 * For supplementary code points (4 & 2), which are rare, 1092 * there is an additional adjustment. 1093 */ 1094 count = (int32_t)((pDestLimit - pDest) / 3); 1095 srcLength = (int32_t)(pSrcLimit - pSrc); 1096 if(count > srcLength) { 1097 count = srcLength; /* min(remaining dest/3, remaining src) */ 1098 } 1099 if(count < 3) { 1100 /* 1101 * Too much overhead if we get near the end of the string, 1102 * continue with the next loop. 1103 */ 1104 break; 1105 } 1106 do { 1107 ch=*pSrc++; 1108 if(ch <= 0x7f) { 1109 *pDest++ = (uint8_t)ch; 1110 } else if(ch <= 0x7ff) { 1111 *pDest++=(uint8_t)((ch>>6)|0xc0); 1112 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1113 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1114 *pDest++=(uint8_t)((ch>>12)|0xe0); 1115 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1116 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1117 } else /* ch is a surrogate */ { 1118 /* 1119 * We will read two UChars and probably output four bytes, 1120 * which we didn't account for with computing count, 1121 * so we adjust it here. 1122 */ 1123 if(--count == 0) { 1124 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 1125 break; /* recompute count */ 1126 } 1127 1128 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 1129 ++pSrc; 1130 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1131 1132 /* writing 4 bytes per 2 UChars is ok */ 1133 *pDest++=(uint8_t)((ch>>18)|0xf0); 1134 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 1135 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1136 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1137 } else { 1138 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1139 if(subchar>=0) { 1140 ch=subchar; 1141 ++numSubstitutions; 1142 } else { 1143 *pErrorCode = U_INVALID_CHAR_FOUND; 1144 return NULL; 1145 } 1146 1147 /* convert and append*/ 1148 pDest=_appendUTF8(pDest, ch); 1149 } 1150 } 1151 } while(--count > 0); 1152 } 1153 1154 while(pSrc<pSrcLimit) { 1155 ch=*pSrc++; 1156 if(ch <= 0x7f) { 1157 if(pDest<pDestLimit) { 1158 *pDest++ = (uint8_t)ch; 1159 } else { 1160 reqLength = 1; 1161 break; 1162 } 1163 } else if(ch <= 0x7ff) { 1164 if((pDestLimit - pDest) >= 2) { 1165 *pDest++=(uint8_t)((ch>>6)|0xc0); 1166 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1167 } else { 1168 reqLength = 2; 1169 break; 1170 } 1171 } else if(ch <= 0xd7ff || ch >= 0xe000) { 1172 if((pDestLimit - pDest) >= 3) { 1173 *pDest++=(uint8_t)((ch>>12)|0xe0); 1174 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1175 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1176 } else { 1177 reqLength = 3; 1178 break; 1179 } 1180 } else /* ch is a surrogate */ { 1181 int32_t length; 1182 1183 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1184 ++pSrc; 1185 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1186 } else if(subchar>=0) { 1187 ch=subchar; 1188 ++numSubstitutions; 1189 } else { 1190 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1191 *pErrorCode = U_INVALID_CHAR_FOUND; 1192 return NULL; 1193 } 1194 1195 length = U8_LENGTH(ch); 1196 if((pDestLimit - pDest) >= length) { 1197 /* convert and append*/ 1198 pDest=_appendUTF8(pDest, ch); 1199 } else { 1200 reqLength = length; 1201 break; 1202 } 1203 } 1204 } 1205 while(pSrc<pSrcLimit) { 1206 ch=*pSrc++; 1207 if(ch<=0x7f) { 1208 ++reqLength; 1209 } else if(ch<=0x7ff) { 1210 reqLength+=2; 1211 } else if(!U16_IS_SURROGATE(ch)) { 1212 reqLength+=3; 1213 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1214 ++pSrc; 1215 reqLength+=4; 1216 } else if(subchar>=0) { 1217 reqLength+=U8_LENGTH(subchar); 1218 ++numSubstitutions; 1219 } else { 1220 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1221 *pErrorCode = U_INVALID_CHAR_FOUND; 1222 return NULL; 1223 } 1224 } 1225 } 1226 1227 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1228 1229 if(pNumSubstitutions!=NULL) { 1230 *pNumSubstitutions=numSubstitutions; 1231 } 1232 1233 if(pDestLength){ 1234 *pDestLength = reqLength; 1235 } 1236 1237 /* Terminate the buffer */ 1238 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1239 return dest; 1240 } 1241 1242 U_CAPI char* U_EXPORT2 1243 u_strToUTF8(char *dest, 1244 int32_t destCapacity, 1245 int32_t *pDestLength, 1246 const UChar *pSrc, 1247 int32_t srcLength, 1248 UErrorCode *pErrorCode){ 1249 return u_strToUTF8WithSub( 1250 dest, destCapacity, pDestLength, 1251 pSrc, srcLength, 1252 U_SENTINEL, NULL, 1253 pErrorCode); 1254 } 1255 1256 U_CAPI UChar* U_EXPORT2 1257 u_strFromJavaModifiedUTF8WithSub( 1258 UChar *dest, 1259 int32_t destCapacity, 1260 int32_t *pDestLength, 1261 const char *src, 1262 int32_t srcLength, 1263 UChar32 subchar, int32_t *pNumSubstitutions, 1264 UErrorCode *pErrorCode) { 1265 UChar *pDest = dest; 1266 UChar *pDestLimit = dest+destCapacity; 1267 UChar32 ch; 1268 int32_t reqLength = 0; 1269 const uint8_t* pSrc = (const uint8_t*) src; 1270 const uint8_t *pSrcLimit; 1271 int32_t count; 1272 uint8_t t1, t2; /* trail bytes */ 1273 int32_t numSubstitutions; 1274 1275 /* args check */ 1276 if(U_FAILURE(*pErrorCode)){ 1277 return NULL; 1278 } 1279 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1280 (dest==NULL && destCapacity!=0) || destCapacity<0 || 1281 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1282 ) { 1283 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1284 return NULL; 1285 } 1286 1287 if(pNumSubstitutions!=NULL) { 1288 *pNumSubstitutions=0; 1289 } 1290 numSubstitutions=0; 1291 1292 if(srcLength < 0) { 1293 /* 1294 * Transform a NUL-terminated ASCII string. 1295 * Handle non-ASCII strings with slower code. 1296 */ 1297 while(((ch = *pSrc) != 0) && ch <= 0x7f && (pDest < pDestLimit)) { 1298 *pDest++=(UChar)ch; 1299 ++pSrc; 1300 } 1301 if(ch == 0) { 1302 reqLength=(int32_t)(pDest - dest); 1303 if(pDestLength) { 1304 *pDestLength = reqLength; 1305 } 1306 1307 /* Terminate the buffer */ 1308 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1309 return dest; 1310 } 1311 srcLength = uprv_strlen((const char *)pSrc); 1312 } 1313 1314 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1315 pSrcLimit = (pSrc == NULL) ? NULL : pSrc + srcLength; 1316 for(;;) { 1317 count = (int32_t)(pDestLimit - pDest); 1318 srcLength = (int32_t)(pSrcLimit - pSrc); 1319 if(count >= srcLength && srcLength > 0 && *pSrc <= 0x7f) { 1320 /* fast ASCII loop */ 1321 const uint8_t *prevSrc = pSrc; 1322 int32_t delta; 1323 while(pSrc < pSrcLimit && (ch = *pSrc) <= 0x7f) { 1324 *pDest++=(UChar)ch; 1325 ++pSrc; 1326 } 1327 delta = (int32_t)(pSrc - prevSrc); 1328 count -= delta; 1329 srcLength -= delta; 1330 } 1331 /* 1332 * Each iteration of the inner loop progresses by at most 3 UTF-8 1333 * bytes and one UChar. 1334 */ 1335 srcLength /= 3; 1336 if(count > srcLength) { 1337 count = srcLength; /* min(remaining dest, remaining src/3) */ 1338 } 1339 if(count < 3) { 1340 /* 1341 * Too much overhead if we get near the end of the string, 1342 * continue with the next loop. 1343 */ 1344 break; 1345 } 1346 do { 1347 ch = *pSrc; 1348 if(ch <= 0x7f){ 1349 *pDest++=(UChar)ch; 1350 ++pSrc; 1351 } else { 1352 if(ch >= 0xe0) { 1353 if( /* handle U+0000..U+FFFF inline */ 1354 ch <= 0xef && 1355 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1356 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1357 ) { 1358 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1359 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1360 pSrc += 3; 1361 continue; 1362 } 1363 } else { 1364 if( /* handle U+0000..U+07FF inline */ 1365 ch >= 0xc0 && 1366 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1367 ) { 1368 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1369 pSrc += 2; 1370 continue; 1371 } 1372 } 1373 1374 if(subchar < 0) { 1375 *pErrorCode = U_INVALID_CHAR_FOUND; 1376 return NULL; 1377 } else if(subchar > 0xffff && --count == 0) { 1378 /* 1379 * We need to write two UChars, adjusted count for that, 1380 * and ran out of space. 1381 */ 1382 break; 1383 } else { 1384 /* function call for error cases */ 1385 ++pSrc; /* continue after the lead byte */ 1386 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1387 ++numSubstitutions; 1388 if(subchar<=0xFFFF) { 1389 *(pDest++)=(UChar)subchar; 1390 } else { 1391 *(pDest++)=U16_LEAD(subchar); 1392 *(pDest++)=U16_TRAIL(subchar); 1393 } 1394 } 1395 } 1396 } while(--count > 0); 1397 } 1398 1399 while((pSrc<pSrcLimit) && (pDest<pDestLimit)) { 1400 ch = *pSrc; 1401 if(ch <= 0x7f){ 1402 *pDest++=(UChar)ch; 1403 ++pSrc; 1404 } else { 1405 if(ch >= 0xe0) { 1406 if( /* handle U+0000..U+FFFF inline */ 1407 ch <= 0xef && 1408 ((pSrcLimit - pSrc) >= 3) && 1409 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f && 1410 (t2 = (uint8_t)(pSrc[2] - 0x80)) <= 0x3f 1411 ) { 1412 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1413 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1414 pSrc += 3; 1415 continue; 1416 } 1417 } else { 1418 if( /* handle U+0000..U+07FF inline */ 1419 ch >= 0xc0 && 1420 ((pSrcLimit - pSrc) >= 2) && 1421 (t1 = (uint8_t)(pSrc[1] - 0x80)) <= 0x3f 1422 ) { 1423 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1424 pSrc += 2; 1425 continue; 1426 } 1427 } 1428 1429 if(subchar < 0) { 1430 *pErrorCode = U_INVALID_CHAR_FOUND; 1431 return NULL; 1432 } else { 1433 /* function call for error cases */ 1434 ++pSrc; /* continue after the lead byte */ 1435 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1436 ++numSubstitutions; 1437 if(subchar<=0xFFFF) { 1438 *(pDest++)=(UChar)subchar; 1439 } else { 1440 *(pDest++)=U16_LEAD(subchar); 1441 if(pDest<pDestLimit) { 1442 *(pDest++)=U16_TRAIL(subchar); 1443 } else { 1444 reqLength++; 1445 break; 1446 } 1447 } 1448 } 1449 } 1450 } 1451 1452 /* do not fill the dest buffer just count the UChars needed */ 1453 while(pSrc < pSrcLimit){ 1454 ch = *pSrc; 1455 if(ch <= 0x7f) { 1456 reqLength++; 1457 ++pSrc; 1458 } else { 1459 if(ch >= 0xe0) { 1460 if( /* handle U+0000..U+FFFF inline */ 1461 ch <= 0xef && 1462 ((pSrcLimit - pSrc) >= 3) && 1463 (uint8_t)(pSrc[1] - 0x80) <= 0x3f && 1464 (uint8_t)(pSrc[2] - 0x80) <= 0x3f 1465 ) { 1466 reqLength++; 1467 pSrc += 3; 1468 continue; 1469 } 1470 } else { 1471 if( /* handle U+0000..U+07FF inline */ 1472 ch >= 0xc0 && 1473 ((pSrcLimit - pSrc) >= 2) && 1474 (uint8_t)(pSrc[1] - 0x80) <= 0x3f 1475 ) { 1476 reqLength++; 1477 pSrc += 2; 1478 continue; 1479 } 1480 } 1481 1482 if(subchar < 0) { 1483 *pErrorCode = U_INVALID_CHAR_FOUND; 1484 return NULL; 1485 } else { 1486 /* function call for error cases */ 1487 ++pSrc; /* continue after the lead byte */ 1488 utf8_nextCharSafeBodyPointer(&pSrc, pSrcLimit, ch); 1489 ++numSubstitutions; 1490 reqLength+=U16_LENGTH(ch); 1491 } 1492 } 1493 } 1494 1495 if(pNumSubstitutions!=NULL) { 1496 *pNumSubstitutions=numSubstitutions; 1497 } 1498 1499 reqLength+=(int32_t)(pDest - dest); 1500 if(pDestLength) { 1501 *pDestLength = reqLength; 1502 } 1503 1504 /* Terminate the buffer */ 1505 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1506 return dest; 1507 } 1508 1509 U_CAPI char* U_EXPORT2 1510 u_strToJavaModifiedUTF8( 1511 char *dest, 1512 int32_t destCapacity, 1513 int32_t *pDestLength, 1514 const UChar *src, 1515 int32_t srcLength, 1516 UErrorCode *pErrorCode) { 1517 int32_t reqLength=0; 1518 uint32_t ch=0; 1519 uint8_t *pDest = (uint8_t *)dest; 1520 uint8_t *pDestLimit = pDest + destCapacity; 1521 const UChar *pSrcLimit; 1522 int32_t count; 1523 1524 /* args check */ 1525 if(U_FAILURE(*pErrorCode)){ 1526 return NULL; 1527 } 1528 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1529 (dest==NULL && destCapacity!=0) || destCapacity<0 1530 ) { 1531 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1532 return NULL; 1533 } 1534 1535 if(srcLength==-1) { 1536 /* Convert NUL-terminated ASCII, then find the string length. */ 1537 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1538 *pDest++ = (uint8_t)ch; 1539 ++src; 1540 } 1541 if(ch == 0) { 1542 reqLength=(int32_t)(pDest - (uint8_t *)dest); 1543 if(pDestLength) { 1544 *pDestLength = reqLength; 1545 } 1546 1547 /* Terminate the buffer */ 1548 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1549 return dest; 1550 } 1551 srcLength = u_strlen(src); 1552 } 1553 1554 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1555 pSrcLimit = (src!=NULL)?(src+srcLength):NULL; 1556 for(;;) { 1557 count = (int32_t)(pDestLimit - pDest); 1558 srcLength = (int32_t)(pSrcLimit - src); 1559 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1560 /* fast ASCII loop */ 1561 const UChar *prevSrc = src; 1562 int32_t delta; 1563 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1564 *pDest++=(uint8_t)ch; 1565 ++src; 1566 } 1567 delta = (int32_t)(src - prevSrc); 1568 count -= delta; 1569 srcLength -= delta; 1570 } 1571 /* 1572 * Each iteration of the inner loop progresses by at most 3 UTF-8 1573 * bytes and one UChar. 1574 */ 1575 count /= 3; 1576 if(count > srcLength) { 1577 count = srcLength; /* min(remaining dest/3, remaining src) */ 1578 } 1579 if(count < 3) { 1580 /* 1581 * Too much overhead if we get near the end of the string, 1582 * continue with the next loop. 1583 */ 1584 break; 1585 } 1586 do { 1587 ch=*src++; 1588 if(ch <= 0x7f && ch != 0) { 1589 *pDest++ = (uint8_t)ch; 1590 } else if(ch <= 0x7ff) { 1591 *pDest++=(uint8_t)((ch>>6)|0xc0); 1592 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1593 } else { 1594 *pDest++=(uint8_t)((ch>>12)|0xe0); 1595 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1596 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1597 } 1598 } while(--count > 0); 1599 } 1600 1601 while(src<pSrcLimit) { 1602 ch=*src++; 1603 if(ch <= 0x7f && ch != 0) { 1604 if(pDest<pDestLimit) { 1605 *pDest++ = (uint8_t)ch; 1606 } else { 1607 reqLength = 1; 1608 break; 1609 } 1610 } else if(ch <= 0x7ff) { 1611 if((pDestLimit - pDest) >= 2) { 1612 *pDest++=(uint8_t)((ch>>6)|0xc0); 1613 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1614 } else { 1615 reqLength = 2; 1616 break; 1617 } 1618 } else { 1619 if((pDestLimit - pDest) >= 3) { 1620 *pDest++=(uint8_t)((ch>>12)|0xe0); 1621 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1622 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1623 } else { 1624 reqLength = 3; 1625 break; 1626 } 1627 } 1628 } 1629 while(src<pSrcLimit) { 1630 ch=*src++; 1631 if(ch <= 0x7f && ch != 0) { 1632 ++reqLength; 1633 } else if(ch<=0x7ff) { 1634 reqLength+=2; 1635 } else { 1636 reqLength+=3; 1637 } 1638 } 1639 1640 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1641 if(pDestLength){ 1642 *pDestLength = reqLength; 1643 } 1644 1645 /* Terminate the buffer */ 1646 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1647 return dest; 1648 } 1649