1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 2001-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ****************************************************************************** 10 * 11 * File ustrtrns.cpp 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 9/10/2001 Ram Creation. 17 ****************************************************************************** 18 */ 19 20 /******************************************************************************* 21 * 22 * u_strTo* and u_strFrom* APIs 23 * WCS functions moved to ustr_wcs.c for better modularization 24 * 25 ******************************************************************************* 26 */ 27 28 29 #include "unicode/putil.h" 30 #include "unicode/ustring.h" 31 #include "unicode/utf.h" 32 #include "unicode/utf8.h" 33 #include "unicode/utf16.h" 34 #include "cstring.h" 35 #include "cmemory.h" 36 #include "ustr_imp.h" 37 #include "uassert.h" 38 39 U_CAPI UChar* U_EXPORT2 40 u_strFromUTF32WithSub(UChar *dest, 41 int32_t destCapacity, 42 int32_t *pDestLength, 43 const UChar32 *src, 44 int32_t srcLength, 45 UChar32 subchar, int32_t *pNumSubstitutions, 46 UErrorCode *pErrorCode) { 47 const UChar32 *srcLimit; 48 UChar32 ch; 49 UChar *destLimit; 50 UChar *pDest; 51 int32_t reqLength; 52 int32_t numSubstitutions; 53 54 /* args check */ 55 if(U_FAILURE(*pErrorCode)){ 56 return NULL; 57 } 58 if( (src==NULL && srcLength!=0) || srcLength < -1 || 59 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 60 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 61 ) { 62 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 63 return NULL; 64 } 65 66 if(pNumSubstitutions != NULL) { 67 *pNumSubstitutions = 0; 68 } 69 70 pDest = dest; 71 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 72 reqLength = 0; 73 numSubstitutions = 0; 74 75 if(srcLength < 0) { 76 /* simple loop for conversion of a NUL-terminated BMP string */ 77 while((ch=*src) != 0 && 78 ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { 79 ++src; 80 if(pDest < destLimit) { 81 *pDest++ = (UChar)ch; 82 } else { 83 ++reqLength; 84 } 85 } 86 srcLimit = src; 87 if(ch != 0) { 88 /* "complicated" case, find the end of the remaining string */ 89 while(*++srcLimit != 0) {} 90 } 91 } else { 92 srcLimit = (src!=NULL)?(src + srcLength):NULL; 93 } 94 95 /* convert with length */ 96 while(src < srcLimit) { 97 ch = *src++; 98 do { 99 /* usually "loops" once; twice only for writing subchar */ 100 if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { 101 if(pDest < destLimit) { 102 *pDest++ = (UChar)ch; 103 } else { 104 ++reqLength; 105 } 106 break; 107 } else if(0x10000 <= ch && ch <= 0x10ffff) { 108 if(pDest!=NULL && ((pDest + 2) <= destLimit)) { 109 *pDest++ = U16_LEAD(ch); 110 *pDest++ = U16_TRAIL(ch); 111 } else { 112 reqLength += 2; 113 } 114 break; 115 } else if((ch = subchar) < 0) { 116 /* surrogate code point, or not a Unicode code point at all */ 117 *pErrorCode = U_INVALID_CHAR_FOUND; 118 return NULL; 119 } else { 120 ++numSubstitutions; 121 } 122 } while(TRUE); 123 } 124 125 reqLength += (int32_t)(pDest - dest); 126 if(pDestLength) { 127 *pDestLength = reqLength; 128 } 129 if(pNumSubstitutions != NULL) { 130 *pNumSubstitutions = numSubstitutions; 131 } 132 133 /* Terminate the buffer */ 134 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 135 136 return dest; 137 } 138 139 U_CAPI UChar* U_EXPORT2 140 u_strFromUTF32(UChar *dest, 141 int32_t destCapacity, 142 int32_t *pDestLength, 143 const UChar32 *src, 144 int32_t srcLength, 145 UErrorCode *pErrorCode) { 146 return u_strFromUTF32WithSub( 147 dest, destCapacity, pDestLength, 148 src, srcLength, 149 U_SENTINEL, NULL, 150 pErrorCode); 151 } 152 153 U_CAPI UChar32* U_EXPORT2 154 u_strToUTF32WithSub(UChar32 *dest, 155 int32_t destCapacity, 156 int32_t *pDestLength, 157 const UChar *src, 158 int32_t srcLength, 159 UChar32 subchar, int32_t *pNumSubstitutions, 160 UErrorCode *pErrorCode) { 161 const UChar *srcLimit; 162 UChar32 ch; 163 UChar ch2; 164 UChar32 *destLimit; 165 UChar32 *pDest; 166 int32_t reqLength; 167 int32_t numSubstitutions; 168 169 /* args check */ 170 if(U_FAILURE(*pErrorCode)){ 171 return NULL; 172 } 173 if( (src==NULL && srcLength!=0) || srcLength < -1 || 174 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 175 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 176 ) { 177 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 178 return NULL; 179 } 180 181 if(pNumSubstitutions != NULL) { 182 *pNumSubstitutions = 0; 183 } 184 185 pDest = dest; 186 destLimit = (dest!=NULL)?(dest + destCapacity):NULL; 187 reqLength = 0; 188 numSubstitutions = 0; 189 190 if(srcLength < 0) { 191 /* simple loop for conversion of a NUL-terminated BMP string */ 192 while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { 193 ++src; 194 if(pDest < destLimit) { 195 *pDest++ = ch; 196 } else { 197 ++reqLength; 198 } 199 } 200 srcLimit = src; 201 if(ch != 0) { 202 /* "complicated" case, find the end of the remaining string */ 203 while(*++srcLimit != 0) {} 204 } 205 } else { 206 srcLimit = (src!=NULL)?(src + srcLength):NULL; 207 } 208 209 /* convert with length */ 210 while(src < srcLimit) { 211 ch = *src++; 212 if(!U16_IS_SURROGATE(ch)) { 213 /* write or count ch below */ 214 } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { 215 ++src; 216 ch = U16_GET_SUPPLEMENTARY(ch, ch2); 217 } else if((ch = subchar) < 0) { 218 /* unpaired surrogate */ 219 *pErrorCode = U_INVALID_CHAR_FOUND; 220 return NULL; 221 } else { 222 ++numSubstitutions; 223 } 224 if(pDest < destLimit) { 225 *pDest++ = ch; 226 } else { 227 ++reqLength; 228 } 229 } 230 231 reqLength += (int32_t)(pDest - dest); 232 if(pDestLength) { 233 *pDestLength = reqLength; 234 } 235 if(pNumSubstitutions != NULL) { 236 *pNumSubstitutions = numSubstitutions; 237 } 238 239 /* Terminate the buffer */ 240 u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); 241 242 return dest; 243 } 244 245 U_CAPI UChar32* U_EXPORT2 246 u_strToUTF32(UChar32 *dest, 247 int32_t destCapacity, 248 int32_t *pDestLength, 249 const UChar *src, 250 int32_t srcLength, 251 UErrorCode *pErrorCode) { 252 return u_strToUTF32WithSub( 253 dest, destCapacity, pDestLength, 254 src, srcLength, 255 U_SENTINEL, NULL, 256 pErrorCode); 257 } 258 259 U_CAPI UChar* U_EXPORT2 260 u_strFromUTF8WithSub(UChar *dest, 261 int32_t destCapacity, 262 int32_t *pDestLength, 263 const char* src, 264 int32_t srcLength, 265 UChar32 subchar, int32_t *pNumSubstitutions, 266 UErrorCode *pErrorCode){ 267 /* args check */ 268 if(U_FAILURE(*pErrorCode)) { 269 return NULL; 270 } 271 if( (src==NULL && srcLength!=0) || srcLength < -1 || 272 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 273 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 274 ) { 275 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 276 return NULL; 277 } 278 279 if(pNumSubstitutions!=NULL) { 280 *pNumSubstitutions=0; 281 } 282 UChar *pDest = dest; 283 UChar *pDestLimit = dest+destCapacity; 284 int32_t reqLength = 0; 285 int32_t numSubstitutions=0; 286 287 /* 288 * Inline processing of UTF-8 byte sequences: 289 * 290 * Byte sequences for the most common characters are handled inline in 291 * the conversion loops. In order to reduce the path lengths for those 292 * characters, the tests are arranged in a kind of binary search. 293 * ASCII (<=0x7f) is checked first, followed by the dividing point 294 * between 2- and 3-byte sequences (0xe0). 295 * The 3-byte branch is tested first to speed up CJK text. 296 * The compiler should combine the subtractions for the two tests for 0xe0. 297 * Each branch then tests for the other end of its range. 298 */ 299 300 if(srcLength < 0){ 301 /* 302 * Transform a NUL-terminated string. 303 * The code explicitly checks for NULs only in the lead byte position. 304 * A NUL byte in the trail byte position fails the trail byte range check anyway. 305 */ 306 int32_t i; 307 UChar32 c; 308 for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { 309 // modified copy of U8_NEXT() 310 ++i; 311 if(U8_IS_SINGLE(c)) { 312 *pDest++=(UChar)c; 313 } else { 314 uint8_t __t1, __t2; 315 if( /* handle U+0800..U+FFFF inline */ 316 (0xe0<=(c) && (c)<0xf0) && 317 U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && 318 (__t2=src[(i)+1]-0x80)<=0x3f) { 319 *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; 320 i+=2; 321 } else if( /* handle U+0080..U+07FF inline */ 322 ((c)<0xe0 && (c)>=0xc2) && 323 (__t1=src[i]-0x80)<=0x3f) { 324 *pDest++ = (((c)&0x1f)<<6)|__t1; 325 ++(i); 326 } else { 327 /* function call for "complicated" and error cases */ 328 (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); 329 if(c<0 && (++numSubstitutions, c = subchar) < 0) { 330 *pErrorCode = U_INVALID_CHAR_FOUND; 331 return NULL; 332 } else if(c<=0xFFFF) { 333 *(pDest++)=(UChar)c; 334 } else { 335 *(pDest++)=U16_LEAD(c); 336 if(pDest<pDestLimit) { 337 *(pDest++)=U16_TRAIL(c); 338 } else { 339 reqLength++; 340 break; 341 } 342 } 343 } 344 } 345 } 346 347 /* Pre-flight the rest of the string. */ 348 while((c = (uint8_t)src[i]) != 0) { 349 // modified copy of U8_NEXT() 350 ++i; 351 if(U8_IS_SINGLE(c)) { 352 ++reqLength; 353 } else { 354 uint8_t __t1, __t2; 355 if( /* handle U+0800..U+FFFF inline */ 356 (0xe0<=(c) && (c)<0xf0) && 357 U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && 358 (__t2=src[(i)+1]-0x80)<=0x3f) { 359 ++reqLength; 360 i+=2; 361 } else if( /* handle U+0080..U+07FF inline */ 362 ((c)<0xe0 && (c)>=0xc2) && 363 (__t1=src[i]-0x80)<=0x3f) { 364 ++reqLength; 365 ++(i); 366 } else { 367 /* function call for "complicated" and error cases */ 368 (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); 369 if(c<0 && (++numSubstitutions, c = subchar) < 0) { 370 *pErrorCode = U_INVALID_CHAR_FOUND; 371 return NULL; 372 } 373 reqLength += U16_LENGTH(c); 374 } 375 } 376 } 377 } else /* srcLength >= 0 */ { 378 /* Faster loop without ongoing checking for srcLength and pDestLimit. */ 379 int32_t i = 0; 380 UChar32 c; 381 for(;;) { 382 /* 383 * Each iteration of the inner loop progresses by at most 3 UTF-8 384 * bytes and one UChar, for most characters. 385 * For supplementary code points (4 & 2), which are rare, 386 * there is an additional adjustment. 387 */ 388 int32_t count = (int32_t)(pDestLimit - pDest); 389 int32_t count2 = (srcLength - i) / 3; 390 if(count > count2) { 391 count = count2; /* min(remaining dest, remaining src/3) */ 392 } 393 if(count < 3) { 394 /* 395 * Too much overhead if we get near the end of the string, 396 * continue with the next loop. 397 */ 398 break; 399 } 400 401 do { 402 // modified copy of U8_NEXT() 403 c = (uint8_t)src[i++]; 404 if(U8_IS_SINGLE(c)) { 405 *pDest++=(UChar)c; 406 } else { 407 uint8_t __t1, __t2; 408 if( /* handle U+0800..U+FFFF inline */ 409 (0xe0<=(c) && (c)<0xf0) && 410 ((i)+1)<srcLength && 411 U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && 412 (__t2=src[(i)+1]-0x80)<=0x3f) { 413 *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; 414 i+=2; 415 } else if( /* handle U+0080..U+07FF inline */ 416 ((c)<0xe0 && (c)>=0xc2) && 417 ((i)!=srcLength) && 418 (__t1=src[i]-0x80)<=0x3f) { 419 *pDest++ = (((c)&0x1f)<<6)|__t1; 420 ++(i); 421 } else { 422 if(c >= 0xf0 || subchar > 0xffff) { 423 // We may read up to four bytes and write up to two UChars, 424 // which we didn't account for with computing count, 425 // so we adjust it here. 426 if(--count == 0) { 427 --i; // back out byte c 428 break; 429 } 430 } 431 432 /* function call for "complicated" and error cases */ 433 (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); 434 if(c<0 && (++numSubstitutions, c = subchar) < 0) { 435 *pErrorCode = U_INVALID_CHAR_FOUND; 436 return NULL; 437 } else if(c<=0xFFFF) { 438 *(pDest++)=(UChar)c; 439 } else { 440 *(pDest++)=U16_LEAD(c); 441 *(pDest++)=U16_TRAIL(c); 442 } 443 } 444 } 445 } while(--count > 0); 446 } 447 448 while(i < srcLength && (pDest < pDestLimit)) { 449 // modified copy of U8_NEXT() 450 c = (uint8_t)src[i++]; 451 if(U8_IS_SINGLE(c)) { 452 *pDest++=(UChar)c; 453 } else { 454 uint8_t __t1, __t2; 455 if( /* handle U+0800..U+FFFF inline */ 456 (0xe0<=(c) && (c)<0xf0) && 457 ((i)+1)<srcLength && 458 U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && 459 (__t2=src[(i)+1]-0x80)<=0x3f) { 460 *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; 461 i+=2; 462 } else if( /* handle U+0080..U+07FF inline */ 463 ((c)<0xe0 && (c)>=0xc2) && 464 ((i)!=srcLength) && 465 (__t1=src[i]-0x80)<=0x3f) { 466 *pDest++ = (((c)&0x1f)<<6)|__t1; 467 ++(i); 468 } else { 469 /* function call for "complicated" and error cases */ 470 (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); 471 if(c<0 && (++numSubstitutions, c = subchar) < 0) { 472 *pErrorCode = U_INVALID_CHAR_FOUND; 473 return NULL; 474 } else if(c<=0xFFFF) { 475 *(pDest++)=(UChar)c; 476 } else { 477 *(pDest++)=U16_LEAD(c); 478 if(pDest<pDestLimit) { 479 *(pDest++)=U16_TRAIL(c); 480 } else { 481 reqLength++; 482 break; 483 } 484 } 485 } 486 } 487 } 488 489 /* Pre-flight the rest of the string. */ 490 while(i < srcLength) { 491 // modified copy of U8_NEXT() 492 c = (uint8_t)src[i++]; 493 if(U8_IS_SINGLE(c)) { 494 ++reqLength; 495 } else { 496 uint8_t __t1, __t2; 497 if( /* handle U+0800..U+FFFF inline */ 498 (0xe0<=(c) && (c)<0xf0) && 499 ((i)+1)<srcLength && 500 U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && 501 (__t2=src[(i)+1]-0x80)<=0x3f) { 502 ++reqLength; 503 i+=2; 504 } else if( /* handle U+0080..U+07FF inline */ 505 ((c)<0xe0 && (c)>=0xc2) && 506 ((i)!=srcLength) && 507 (__t1=src[i]-0x80)<=0x3f) { 508 ++reqLength; 509 ++(i); 510 } else { 511 /* function call for "complicated" and error cases */ 512 (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); 513 if(c<0 && (++numSubstitutions, c = subchar) < 0) { 514 *pErrorCode = U_INVALID_CHAR_FOUND; 515 return NULL; 516 } 517 reqLength += U16_LENGTH(c); 518 } 519 } 520 } 521 } 522 523 reqLength+=(int32_t)(pDest - dest); 524 525 if(pNumSubstitutions!=NULL) { 526 *pNumSubstitutions=numSubstitutions; 527 } 528 529 if(pDestLength){ 530 *pDestLength = reqLength; 531 } 532 533 /* Terminate the buffer */ 534 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 535 536 return dest; 537 } 538 539 U_CAPI UChar* U_EXPORT2 540 u_strFromUTF8(UChar *dest, 541 int32_t destCapacity, 542 int32_t *pDestLength, 543 const char* src, 544 int32_t srcLength, 545 UErrorCode *pErrorCode){ 546 return u_strFromUTF8WithSub( 547 dest, destCapacity, pDestLength, 548 src, srcLength, 549 U_SENTINEL, NULL, 550 pErrorCode); 551 } 552 553 U_CAPI UChar * U_EXPORT2 554 u_strFromUTF8Lenient(UChar *dest, 555 int32_t destCapacity, 556 int32_t *pDestLength, 557 const char *src, 558 int32_t srcLength, 559 UErrorCode *pErrorCode) { 560 UChar *pDest = dest; 561 UChar32 ch; 562 int32_t reqLength = 0; 563 uint8_t* pSrc = (uint8_t*) src; 564 565 /* args check */ 566 if(U_FAILURE(*pErrorCode)){ 567 return NULL; 568 } 569 570 if( (src==NULL && srcLength!=0) || srcLength < -1 || 571 (destCapacity<0) || (dest == NULL && destCapacity > 0) 572 ) { 573 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 574 return NULL; 575 } 576 577 if(srcLength < 0) { 578 /* Transform a NUL-terminated string. */ 579 UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; 580 uint8_t t1, t2, t3; /* trail bytes */ 581 582 while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { 583 if(ch < 0xc0) { 584 /* 585 * ASCII, or a trail byte in lead position which is treated like 586 * a single-byte sequence for better character boundary 587 * resynchronization after illegal sequences. 588 */ 589 *pDest++=(UChar)ch; 590 ++pSrc; 591 continue; 592 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 593 if((t1 = pSrc[1]) != 0) { 594 /* 0x3080 = (0xc0 << 6) + 0x80 */ 595 *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); 596 pSrc += 2; 597 continue; 598 } 599 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 600 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { 601 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 602 /* 0x2080 = (0x80 << 6) + 0x80 */ 603 *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); 604 pSrc += 3; 605 continue; 606 } 607 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 608 if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { 609 pSrc += 4; 610 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 611 ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; 612 *(pDest++) = U16_LEAD(ch); 613 if(pDest < pDestLimit) { 614 *(pDest++) = U16_TRAIL(ch); 615 } else { 616 reqLength = 1; 617 break; 618 } 619 continue; 620 } 621 } 622 623 /* truncated character at the end */ 624 *pDest++ = 0xfffd; 625 while(*++pSrc != 0) {} 626 break; 627 } 628 629 /* Pre-flight the rest of the string. */ 630 while((ch = *pSrc) != 0) { 631 if(ch < 0xc0) { 632 /* 633 * ASCII, or a trail byte in lead position which is treated like 634 * a single-byte sequence for better character boundary 635 * resynchronization after illegal sequences. 636 */ 637 ++reqLength; 638 ++pSrc; 639 continue; 640 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 641 if(pSrc[1] != 0) { 642 ++reqLength; 643 pSrc += 2; 644 continue; 645 } 646 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 647 if(pSrc[1] != 0 && pSrc[2] != 0) { 648 ++reqLength; 649 pSrc += 3; 650 continue; 651 } 652 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 653 if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { 654 reqLength += 2; 655 pSrc += 4; 656 continue; 657 } 658 } 659 660 /* truncated character at the end */ 661 ++reqLength; 662 break; 663 } 664 } else /* srcLength >= 0 */ { 665 const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; 666 667 /* 668 * This function requires that if srcLength is given, then it must be 669 * destCapatity >= srcLength so that we need not check for 670 * destination buffer overflow in the loop. 671 */ 672 if(destCapacity < srcLength) { 673 if(pDestLength != NULL) { 674 *pDestLength = srcLength; /* this likely overestimates the true destLength! */ 675 } 676 *pErrorCode = U_BUFFER_OVERFLOW_ERROR; 677 return NULL; 678 } 679 680 if((pSrcLimit - pSrc) >= 4) { 681 pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ 682 683 /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ 684 do { 685 ch = *pSrc++; 686 if(ch < 0xc0) { 687 /* 688 * ASCII, or a trail byte in lead position which is treated like 689 * a single-byte sequence for better character boundary 690 * resynchronization after illegal sequences. 691 */ 692 *pDest++=(UChar)ch; 693 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 694 /* 0x3080 = (0xc0 << 6) + 0x80 */ 695 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 696 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 697 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 698 /* 0x2080 = (0x80 << 6) + 0x80 */ 699 ch = (ch << 12) + (*pSrc++ << 6); 700 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 701 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 702 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 703 ch = (ch << 18) + (*pSrc++ << 12); 704 ch += *pSrc++ << 6; 705 ch += *pSrc++ - 0x3c82080; 706 *(pDest++) = U16_LEAD(ch); 707 *(pDest++) = U16_TRAIL(ch); 708 } 709 } while(pSrc < pSrcLimit); 710 711 pSrcLimit += 3; /* restore original pSrcLimit */ 712 } 713 714 while(pSrc < pSrcLimit) { 715 ch = *pSrc++; 716 if(ch < 0xc0) { 717 /* 718 * ASCII, or a trail byte in lead position which is treated like 719 * a single-byte sequence for better character boundary 720 * resynchronization after illegal sequences. 721 */ 722 *pDest++=(UChar)ch; 723 continue; 724 } else if(ch < 0xe0) { /* U+0080..U+07FF */ 725 if(pSrc < pSrcLimit) { 726 /* 0x3080 = (0xc0 << 6) + 0x80 */ 727 *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); 728 continue; 729 } 730 } else if(ch < 0xf0) { /* U+0800..U+FFFF */ 731 if((pSrcLimit - pSrc) >= 2) { 732 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 733 /* 0x2080 = (0x80 << 6) + 0x80 */ 734 ch = (ch << 12) + (*pSrc++ << 6); 735 *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); 736 pSrc += 3; 737 continue; 738 } 739 } else /* f0..f4 */ { /* U+10000..U+10FFFF */ 740 if((pSrcLimit - pSrc) >= 3) { 741 /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ 742 ch = (ch << 18) + (*pSrc++ << 12); 743 ch += *pSrc++ << 6; 744 ch += *pSrc++ - 0x3c82080; 745 *(pDest++) = U16_LEAD(ch); 746 *(pDest++) = U16_TRAIL(ch); 747 pSrc += 4; 748 continue; 749 } 750 } 751 752 /* truncated character at the end */ 753 *pDest++ = 0xfffd; 754 break; 755 } 756 } 757 758 reqLength+=(int32_t)(pDest - dest); 759 760 if(pDestLength){ 761 *pDestLength = reqLength; 762 } 763 764 /* Terminate the buffer */ 765 u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); 766 767 return dest; 768 } 769 770 static inline uint8_t * 771 _appendUTF8(uint8_t *pDest, UChar32 c) { 772 /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ 773 if((c)<=0x7f) { 774 *pDest++=(uint8_t)c; 775 } else if(c<=0x7ff) { 776 *pDest++=(uint8_t)((c>>6)|0xc0); 777 *pDest++=(uint8_t)((c&0x3f)|0x80); 778 } else if(c<=0xffff) { 779 *pDest++=(uint8_t)((c>>12)|0xe0); 780 *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); 781 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 782 } else /* if((uint32_t)(c)<=0x10ffff) */ { 783 *pDest++=(uint8_t)(((c)>>18)|0xf0); 784 *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); 785 *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); 786 *pDest++=(uint8_t)(((c)&0x3f)|0x80); 787 } 788 return pDest; 789 } 790 791 792 U_CAPI char* U_EXPORT2 793 u_strToUTF8WithSub(char *dest, 794 int32_t destCapacity, 795 int32_t *pDestLength, 796 const UChar *pSrc, 797 int32_t srcLength, 798 UChar32 subchar, int32_t *pNumSubstitutions, 799 UErrorCode *pErrorCode){ 800 int32_t reqLength=0; 801 uint32_t ch=0,ch2=0; 802 uint8_t *pDest = (uint8_t *)dest; 803 uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; 804 int32_t numSubstitutions; 805 806 /* args check */ 807 if(U_FAILURE(*pErrorCode)){ 808 return NULL; 809 } 810 811 if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || 812 (destCapacity<0) || (dest == NULL && destCapacity > 0) || 813 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 814 ) { 815 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 816 return NULL; 817 } 818 819 if(pNumSubstitutions!=NULL) { 820 *pNumSubstitutions=0; 821 } 822 numSubstitutions=0; 823 824 if(srcLength==-1) { 825 while((ch=*pSrc)!=0) { 826 ++pSrc; 827 if(ch <= 0x7f) { 828 if(pDest<pDestLimit) { 829 *pDest++ = (uint8_t)ch; 830 } else { 831 reqLength = 1; 832 break; 833 } 834 } else if(ch <= 0x7ff) { 835 if((pDestLimit - pDest) >= 2) { 836 *pDest++=(uint8_t)((ch>>6)|0xc0); 837 *pDest++=(uint8_t)((ch&0x3f)|0x80); 838 } else { 839 reqLength = 2; 840 break; 841 } 842 } else if(ch <= 0xd7ff || ch >= 0xe000) { 843 if((pDestLimit - pDest) >= 3) { 844 *pDest++=(uint8_t)((ch>>12)|0xe0); 845 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 846 *pDest++=(uint8_t)((ch&0x3f)|0x80); 847 } else { 848 reqLength = 3; 849 break; 850 } 851 } else /* ch is a surrogate */ { 852 int32_t length; 853 854 /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ 855 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 856 ++pSrc; 857 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 858 } else if(subchar>=0) { 859 ch=subchar; 860 ++numSubstitutions; 861 } else { 862 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 863 *pErrorCode = U_INVALID_CHAR_FOUND; 864 return NULL; 865 } 866 867 length = U8_LENGTH(ch); 868 if((pDestLimit - pDest) >= length) { 869 /* convert and append*/ 870 pDest=_appendUTF8(pDest, ch); 871 } else { 872 reqLength = length; 873 break; 874 } 875 } 876 } 877 while((ch=*pSrc++)!=0) { 878 if(ch<=0x7f) { 879 ++reqLength; 880 } else if(ch<=0x7ff) { 881 reqLength+=2; 882 } else if(!U16_IS_SURROGATE(ch)) { 883 reqLength+=3; 884 } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 885 ++pSrc; 886 reqLength+=4; 887 } else if(subchar>=0) { 888 reqLength+=U8_LENGTH(subchar); 889 ++numSubstitutions; 890 } else { 891 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 892 *pErrorCode = U_INVALID_CHAR_FOUND; 893 return NULL; 894 } 895 } 896 } else { 897 const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; 898 int32_t count; 899 900 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 901 for(;;) { 902 /* 903 * Each iteration of the inner loop progresses by at most 3 UTF-8 904 * bytes and one UChar, for most characters. 905 * For supplementary code points (4 & 2), which are rare, 906 * there is an additional adjustment. 907 */ 908 count = (int32_t)((pDestLimit - pDest) / 3); 909 srcLength = (int32_t)(pSrcLimit - pSrc); 910 if(count > srcLength) { 911 count = srcLength; /* min(remaining dest/3, remaining src) */ 912 } 913 if(count < 3) { 914 /* 915 * Too much overhead if we get near the end of the string, 916 * continue with the next loop. 917 */ 918 break; 919 } 920 do { 921 ch=*pSrc++; 922 if(ch <= 0x7f) { 923 *pDest++ = (uint8_t)ch; 924 } else if(ch <= 0x7ff) { 925 *pDest++=(uint8_t)((ch>>6)|0xc0); 926 *pDest++=(uint8_t)((ch&0x3f)|0x80); 927 } else if(ch <= 0xd7ff || ch >= 0xe000) { 928 *pDest++=(uint8_t)((ch>>12)|0xe0); 929 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 930 *pDest++=(uint8_t)((ch&0x3f)|0x80); 931 } else /* ch is a surrogate */ { 932 /* 933 * We will read two UChars and probably output four bytes, 934 * which we didn't account for with computing count, 935 * so we adjust it here. 936 */ 937 if(--count == 0) { 938 --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ 939 break; /* recompute count */ 940 } 941 942 if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { 943 ++pSrc; 944 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 945 946 /* writing 4 bytes per 2 UChars is ok */ 947 *pDest++=(uint8_t)((ch>>18)|0xf0); 948 *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); 949 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 950 *pDest++=(uint8_t)((ch&0x3f)|0x80); 951 } else { 952 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 953 if(subchar>=0) { 954 ch=subchar; 955 ++numSubstitutions; 956 } else { 957 *pErrorCode = U_INVALID_CHAR_FOUND; 958 return NULL; 959 } 960 961 /* convert and append*/ 962 pDest=_appendUTF8(pDest, ch); 963 } 964 } 965 } while(--count > 0); 966 } 967 968 while(pSrc<pSrcLimit) { 969 ch=*pSrc++; 970 if(ch <= 0x7f) { 971 if(pDest<pDestLimit) { 972 *pDest++ = (uint8_t)ch; 973 } else { 974 reqLength = 1; 975 break; 976 } 977 } else if(ch <= 0x7ff) { 978 if((pDestLimit - pDest) >= 2) { 979 *pDest++=(uint8_t)((ch>>6)|0xc0); 980 *pDest++=(uint8_t)((ch&0x3f)|0x80); 981 } else { 982 reqLength = 2; 983 break; 984 } 985 } else if(ch <= 0xd7ff || ch >= 0xe000) { 986 if((pDestLimit - pDest) >= 3) { 987 *pDest++=(uint8_t)((ch>>12)|0xe0); 988 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 989 *pDest++=(uint8_t)((ch&0x3f)|0x80); 990 } else { 991 reqLength = 3; 992 break; 993 } 994 } else /* ch is a surrogate */ { 995 int32_t length; 996 997 if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 998 ++pSrc; 999 ch=U16_GET_SUPPLEMENTARY(ch, ch2); 1000 } else if(subchar>=0) { 1001 ch=subchar; 1002 ++numSubstitutions; 1003 } else { 1004 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1005 *pErrorCode = U_INVALID_CHAR_FOUND; 1006 return NULL; 1007 } 1008 1009 length = U8_LENGTH(ch); 1010 if((pDestLimit - pDest) >= length) { 1011 /* convert and append*/ 1012 pDest=_appendUTF8(pDest, ch); 1013 } else { 1014 reqLength = length; 1015 break; 1016 } 1017 } 1018 } 1019 while(pSrc<pSrcLimit) { 1020 ch=*pSrc++; 1021 if(ch<=0x7f) { 1022 ++reqLength; 1023 } else if(ch<=0x7ff) { 1024 reqLength+=2; 1025 } else if(!U16_IS_SURROGATE(ch)) { 1026 reqLength+=3; 1027 } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { 1028 ++pSrc; 1029 reqLength+=4; 1030 } else if(subchar>=0) { 1031 reqLength+=U8_LENGTH(subchar); 1032 ++numSubstitutions; 1033 } else { 1034 /* Unicode 3.2 forbids surrogate code points in UTF-8 */ 1035 *pErrorCode = U_INVALID_CHAR_FOUND; 1036 return NULL; 1037 } 1038 } 1039 } 1040 1041 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1042 1043 if(pNumSubstitutions!=NULL) { 1044 *pNumSubstitutions=numSubstitutions; 1045 } 1046 1047 if(pDestLength){ 1048 *pDestLength = reqLength; 1049 } 1050 1051 /* Terminate the buffer */ 1052 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1053 return dest; 1054 } 1055 1056 U_CAPI char* U_EXPORT2 1057 u_strToUTF8(char *dest, 1058 int32_t destCapacity, 1059 int32_t *pDestLength, 1060 const UChar *pSrc, 1061 int32_t srcLength, 1062 UErrorCode *pErrorCode){ 1063 return u_strToUTF8WithSub( 1064 dest, destCapacity, pDestLength, 1065 pSrc, srcLength, 1066 U_SENTINEL, NULL, 1067 pErrorCode); 1068 } 1069 1070 U_CAPI UChar* U_EXPORT2 1071 u_strFromJavaModifiedUTF8WithSub( 1072 UChar *dest, 1073 int32_t destCapacity, 1074 int32_t *pDestLength, 1075 const char *src, 1076 int32_t srcLength, 1077 UChar32 subchar, int32_t *pNumSubstitutions, 1078 UErrorCode *pErrorCode) { 1079 /* args check */ 1080 if(U_FAILURE(*pErrorCode)) { 1081 return NULL; 1082 } 1083 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1084 (dest==NULL && destCapacity!=0) || destCapacity<0 || 1085 subchar > 0x10ffff || U_IS_SURROGATE(subchar) 1086 ) { 1087 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1088 return NULL; 1089 } 1090 1091 if(pNumSubstitutions!=NULL) { 1092 *pNumSubstitutions=0; 1093 } 1094 UChar *pDest = dest; 1095 UChar *pDestLimit = dest+destCapacity; 1096 int32_t reqLength = 0; 1097 int32_t numSubstitutions=0; 1098 1099 if(srcLength < 0) { 1100 /* 1101 * Transform a NUL-terminated ASCII string. 1102 * Handle non-ASCII strings with slower code. 1103 */ 1104 UChar32 c; 1105 while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) { 1106 *pDest++=(UChar)c; 1107 ++src; 1108 } 1109 if(c == 0) { 1110 reqLength=(int32_t)(pDest - dest); 1111 if(pDestLength) { 1112 *pDestLength = reqLength; 1113 } 1114 1115 /* Terminate the buffer */ 1116 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1117 return dest; 1118 } 1119 srcLength = static_cast<int32_t>(uprv_strlen(src)); 1120 } 1121 1122 /* Faster loop without ongoing checking for srcLength and pDestLimit. */ 1123 UChar32 ch; 1124 uint8_t t1, t2; 1125 int32_t i = 0; 1126 for(;;) { 1127 int32_t count = (int32_t)(pDestLimit - pDest); 1128 int32_t count2 = srcLength - i; 1129 if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) { 1130 /* fast ASCII loop */ 1131 int32_t start = i; 1132 uint8_t b; 1133 while(i < srcLength && U8_IS_SINGLE(b = src[i])) { 1134 *pDest++=b; 1135 ++i; 1136 } 1137 int32_t delta = i - start; 1138 count -= delta; 1139 count2 -= delta; 1140 } 1141 /* 1142 * Each iteration of the inner loop progresses by at most 3 UTF-8 1143 * bytes and one UChar. 1144 */ 1145 if(subchar > 0xFFFF) { 1146 break; 1147 } 1148 count2 /= 3; 1149 if(count > count2) { 1150 count = count2; /* min(remaining dest, remaining src/3) */ 1151 } 1152 if(count < 3) { 1153 /* 1154 * Too much overhead if we get near the end of the string, 1155 * continue with the next loop. 1156 */ 1157 break; 1158 } 1159 do { 1160 ch = (uint8_t)src[i++]; 1161 if(U8_IS_SINGLE(ch)) { 1162 *pDest++=(UChar)ch; 1163 } else { 1164 if(ch >= 0xe0) { 1165 if( /* handle U+0000..U+FFFF inline */ 1166 ch <= 0xef && 1167 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && 1168 (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f 1169 ) { 1170 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1171 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1172 i += 2; 1173 continue; 1174 } 1175 } else { 1176 if( /* handle U+0000..U+07FF inline */ 1177 ch >= 0xc0 && 1178 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f 1179 ) { 1180 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1181 ++i; 1182 continue; 1183 } 1184 } 1185 1186 if(subchar < 0) { 1187 *pErrorCode = U_INVALID_CHAR_FOUND; 1188 return NULL; 1189 } else if(subchar > 0xffff && --count == 0) { 1190 /* 1191 * We need to write two UChars, adjusted count for that, 1192 * and ran out of space. 1193 */ 1194 --i; // back out byte ch 1195 break; 1196 } else { 1197 /* function call for error cases */ 1198 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); 1199 ++numSubstitutions; 1200 *(pDest++)=(UChar)subchar; 1201 } 1202 } 1203 } while(--count > 0); 1204 } 1205 1206 while(i < srcLength && (pDest < pDestLimit)) { 1207 ch = (uint8_t)src[i++]; 1208 if(U8_IS_SINGLE(ch)){ 1209 *pDest++=(UChar)ch; 1210 } else { 1211 if(ch >= 0xe0) { 1212 if( /* handle U+0000..U+FFFF inline */ 1213 ch <= 0xef && 1214 (i+1) < srcLength && 1215 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && 1216 (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f 1217 ) { 1218 /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ 1219 *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); 1220 i += 2; 1221 continue; 1222 } 1223 } else { 1224 if( /* handle U+0000..U+07FF inline */ 1225 ch >= 0xc0 && 1226 i < srcLength && 1227 (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f 1228 ) { 1229 *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); 1230 ++i; 1231 continue; 1232 } 1233 } 1234 1235 if(subchar < 0) { 1236 *pErrorCode = U_INVALID_CHAR_FOUND; 1237 return NULL; 1238 } else { 1239 /* function call for error cases */ 1240 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); 1241 ++numSubstitutions; 1242 if(subchar<=0xFFFF) { 1243 *(pDest++)=(UChar)subchar; 1244 } else { 1245 *(pDest++)=U16_LEAD(subchar); 1246 if(pDest<pDestLimit) { 1247 *(pDest++)=U16_TRAIL(subchar); 1248 } else { 1249 reqLength++; 1250 break; 1251 } 1252 } 1253 } 1254 } 1255 } 1256 1257 /* Pre-flight the rest of the string. */ 1258 while(i < srcLength) { 1259 ch = (uint8_t)src[i++]; 1260 if(U8_IS_SINGLE(ch)) { 1261 reqLength++; 1262 } else { 1263 if(ch >= 0xe0) { 1264 if( /* handle U+0000..U+FFFF inline */ 1265 ch <= 0xef && 1266 (i+1) < srcLength && 1267 (uint8_t)(src[i] - 0x80) <= 0x3f && 1268 (uint8_t)(src[i+1] - 0x80) <= 0x3f 1269 ) { 1270 reqLength++; 1271 i += 2; 1272 continue; 1273 } 1274 } else { 1275 if( /* handle U+0000..U+07FF inline */ 1276 ch >= 0xc0 && 1277 i < srcLength && 1278 (uint8_t)(src[i] - 0x80) <= 0x3f 1279 ) { 1280 reqLength++; 1281 ++i; 1282 continue; 1283 } 1284 } 1285 1286 if(subchar < 0) { 1287 *pErrorCode = U_INVALID_CHAR_FOUND; 1288 return NULL; 1289 } else { 1290 /* function call for error cases */ 1291 utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); 1292 ++numSubstitutions; 1293 reqLength+=U16_LENGTH(ch); 1294 } 1295 } 1296 } 1297 1298 if(pNumSubstitutions!=NULL) { 1299 *pNumSubstitutions=numSubstitutions; 1300 } 1301 1302 reqLength+=(int32_t)(pDest - dest); 1303 if(pDestLength) { 1304 *pDestLength = reqLength; 1305 } 1306 1307 /* Terminate the buffer */ 1308 u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); 1309 return dest; 1310 } 1311 1312 U_CAPI char* U_EXPORT2 1313 u_strToJavaModifiedUTF8( 1314 char *dest, 1315 int32_t destCapacity, 1316 int32_t *pDestLength, 1317 const UChar *src, 1318 int32_t srcLength, 1319 UErrorCode *pErrorCode) { 1320 int32_t reqLength=0; 1321 uint32_t ch=0; 1322 uint8_t *pDest = (uint8_t *)dest; 1323 uint8_t *pDestLimit = pDest + destCapacity; 1324 const UChar *pSrcLimit; 1325 int32_t count; 1326 1327 /* args check */ 1328 if(U_FAILURE(*pErrorCode)){ 1329 return NULL; 1330 } 1331 if( (src==NULL && srcLength!=0) || srcLength < -1 || 1332 (dest==NULL && destCapacity!=0) || destCapacity<0 1333 ) { 1334 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1335 return NULL; 1336 } 1337 1338 if(srcLength==-1) { 1339 /* Convert NUL-terminated ASCII, then find the string length. */ 1340 while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { 1341 *pDest++ = (uint8_t)ch; 1342 ++src; 1343 } 1344 if(ch == 0) { 1345 reqLength=(int32_t)(pDest - (uint8_t *)dest); 1346 if(pDestLength) { 1347 *pDestLength = reqLength; 1348 } 1349 1350 /* Terminate the buffer */ 1351 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1352 return dest; 1353 } 1354 srcLength = u_strlen(src); 1355 } 1356 1357 /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ 1358 pSrcLimit = (src!=NULL)?(src+srcLength):NULL; 1359 for(;;) { 1360 count = (int32_t)(pDestLimit - pDest); 1361 srcLength = (int32_t)(pSrcLimit - src); 1362 if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { 1363 /* fast ASCII loop */ 1364 const UChar *prevSrc = src; 1365 int32_t delta; 1366 while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { 1367 *pDest++=(uint8_t)ch; 1368 ++src; 1369 } 1370 delta = (int32_t)(src - prevSrc); 1371 count -= delta; 1372 srcLength -= delta; 1373 } 1374 /* 1375 * Each iteration of the inner loop progresses by at most 3 UTF-8 1376 * bytes and one UChar. 1377 */ 1378 count /= 3; 1379 if(count > srcLength) { 1380 count = srcLength; /* min(remaining dest/3, remaining src) */ 1381 } 1382 if(count < 3) { 1383 /* 1384 * Too much overhead if we get near the end of the string, 1385 * continue with the next loop. 1386 */ 1387 break; 1388 } 1389 do { 1390 ch=*src++; 1391 if(ch <= 0x7f && ch != 0) { 1392 *pDest++ = (uint8_t)ch; 1393 } else if(ch <= 0x7ff) { 1394 *pDest++=(uint8_t)((ch>>6)|0xc0); 1395 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1396 } else { 1397 *pDest++=(uint8_t)((ch>>12)|0xe0); 1398 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1399 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1400 } 1401 } while(--count > 0); 1402 } 1403 1404 while(src<pSrcLimit) { 1405 ch=*src++; 1406 if(ch <= 0x7f && ch != 0) { 1407 if(pDest<pDestLimit) { 1408 *pDest++ = (uint8_t)ch; 1409 } else { 1410 reqLength = 1; 1411 break; 1412 } 1413 } else if(ch <= 0x7ff) { 1414 if((pDestLimit - pDest) >= 2) { 1415 *pDest++=(uint8_t)((ch>>6)|0xc0); 1416 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1417 } else { 1418 reqLength = 2; 1419 break; 1420 } 1421 } else { 1422 if((pDestLimit - pDest) >= 3) { 1423 *pDest++=(uint8_t)((ch>>12)|0xe0); 1424 *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); 1425 *pDest++=(uint8_t)((ch&0x3f)|0x80); 1426 } else { 1427 reqLength = 3; 1428 break; 1429 } 1430 } 1431 } 1432 while(src<pSrcLimit) { 1433 ch=*src++; 1434 if(ch <= 0x7f && ch != 0) { 1435 ++reqLength; 1436 } else if(ch<=0x7ff) { 1437 reqLength+=2; 1438 } else { 1439 reqLength+=3; 1440 } 1441 } 1442 1443 reqLength+=(int32_t)(pDest - (uint8_t *)dest); 1444 if(pDestLength){ 1445 *pDestLength = reqLength; 1446 } 1447 1448 /* Terminate the buffer */ 1449 u_terminateChars(dest, destCapacity, reqLength, pErrorCode); 1450 return dest; 1451 } 1452