1 /* 2 ****************************************************************************** 3 * 4 * Copyright (C) 1998-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ****************************************************************************** 8 * 9 * File ustring.h 10 * 11 * Modification History: 12 * 13 * Date Name Description 14 * 12/07/98 bertrand Creation. 15 ****************************************************************************** 16 */ 17 18 #include "unicode/utypes.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "cstring.h" 22 #include "cwchar.h" 23 #include "cmemory.h" 24 #include "ustr_imp.h" 25 26 /* ANSI string.h - style functions ------------------------------------------ */ 27 28 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ 29 #define U_BMP_MAX 0xffff 30 31 /* Forward binary string search functions ----------------------------------- */ 32 33 /* 34 * Test if a substring match inside a string is at code point boundaries. 35 * All pointers refer to the same buffer. 36 * The limit pointer may be NULL, all others must be real pointers. 37 */ 38 static U_INLINE UBool 39 isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { 40 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { 41 /* the leading edge of the match is in the middle of a surrogate pair */ 42 return FALSE; 43 } 44 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { 45 /* the trailing edge of the match is in the middle of a surrogate pair */ 46 return FALSE; 47 } 48 return TRUE; 49 } 50 51 U_CAPI UChar * U_EXPORT2 52 u_strFindFirst(const UChar *s, int32_t length, 53 const UChar *sub, int32_t subLength) { 54 const UChar *start, *p, *q, *subLimit; 55 UChar c, cs, cq; 56 57 if(sub==NULL || subLength<-1) { 58 return (UChar *)s; 59 } 60 if(s==NULL || length<-1) { 61 return NULL; 62 } 63 64 start=s; 65 66 if(length<0 && subLength<0) { 67 /* both strings are NUL-terminated */ 68 if((cs=*sub++)==0) { 69 return (UChar *)s; 70 } 71 if(*sub==0 && !U16_IS_SURROGATE(cs)) { 72 /* the substring consists of a single, non-surrogate BMP code point */ 73 return u_strchr(s, cs); 74 } 75 76 while((c=*s++)!=0) { 77 if(c==cs) { 78 /* found first substring UChar, compare rest */ 79 p=s; 80 q=sub; 81 for(;;) { 82 if((cq=*q)==0) { 83 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 84 return (UChar *)(s-1); /* well-formed match */ 85 } else { 86 break; /* no match because surrogate pair is split */ 87 } 88 } 89 if((c=*p)==0) { 90 return NULL; /* no match, and none possible after s */ 91 } 92 if(c!=cq) { 93 break; /* no match */ 94 } 95 ++p; 96 ++q; 97 } 98 } 99 } 100 101 /* not found */ 102 return NULL; 103 } 104 105 if(subLength<0) { 106 subLength=u_strlen(sub); 107 } 108 if(subLength==0) { 109 return (UChar *)s; 110 } 111 112 /* get sub[0] to search for it fast */ 113 cs=*sub++; 114 --subLength; 115 subLimit=sub+subLength; 116 117 if(subLength==0 && !U16_IS_SURROGATE(cs)) { 118 /* the substring consists of a single, non-surrogate BMP code point */ 119 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); 120 } 121 122 if(length<0) { 123 /* s is NUL-terminated */ 124 while((c=*s++)!=0) { 125 if(c==cs) { 126 /* found first substring UChar, compare rest */ 127 p=s; 128 q=sub; 129 for(;;) { 130 if(q==subLimit) { 131 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 132 return (UChar *)(s-1); /* well-formed match */ 133 } else { 134 break; /* no match because surrogate pair is split */ 135 } 136 } 137 if((c=*p)==0) { 138 return NULL; /* no match, and none possible after s */ 139 } 140 if(c!=*q) { 141 break; /* no match */ 142 } 143 ++p; 144 ++q; 145 } 146 } 147 } 148 } else { 149 const UChar *limit, *preLimit; 150 151 /* subLength was decremented above */ 152 if(length<=subLength) { 153 return NULL; /* s is shorter than sub */ 154 } 155 156 limit=s+length; 157 158 /* the substring must start before preLimit */ 159 preLimit=limit-subLength; 160 161 while(s!=preLimit) { 162 c=*s++; 163 if(c==cs) { 164 /* found first substring UChar, compare rest */ 165 p=s; 166 q=sub; 167 for(;;) { 168 if(q==subLimit) { 169 if(isMatchAtCPBoundary(start, s-1, p, limit)) { 170 return (UChar *)(s-1); /* well-formed match */ 171 } else { 172 break; /* no match because surrogate pair is split */ 173 } 174 } 175 if(*p!=*q) { 176 break; /* no match */ 177 } 178 ++p; 179 ++q; 180 } 181 } 182 } 183 } 184 185 /* not found */ 186 return NULL; 187 } 188 189 U_CAPI UChar * U_EXPORT2 190 u_strstr(const UChar *s, const UChar *substring) { 191 return u_strFindFirst(s, -1, substring, -1); 192 } 193 194 U_CAPI UChar * U_EXPORT2 195 u_strchr(const UChar *s, UChar c) { 196 if(U16_IS_SURROGATE(c)) { 197 /* make sure to not find half of a surrogate pair */ 198 return u_strFindFirst(s, -1, &c, 1); 199 } else { 200 UChar cs; 201 202 /* trivial search for a BMP code point */ 203 for(;;) { 204 if((cs=*s)==c) { 205 return (UChar *)s; 206 } 207 if(cs==0) { 208 return NULL; 209 } 210 ++s; 211 } 212 } 213 } 214 215 U_CAPI UChar * U_EXPORT2 216 u_strchr32(const UChar *s, UChar32 c) { 217 if((uint32_t)c<=U_BMP_MAX) { 218 /* find BMP code point */ 219 return u_strchr(s, (UChar)c); 220 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 221 /* find supplementary code point as surrogate pair */ 222 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 223 224 while((cs=*s++)!=0) { 225 if(cs==lead && *s==trail) { 226 return (UChar *)(s-1); 227 } 228 } 229 return NULL; 230 } else { 231 /* not a Unicode code point, not findable */ 232 return NULL; 233 } 234 } 235 236 U_CAPI UChar * U_EXPORT2 237 u_memchr(const UChar *s, UChar c, int32_t count) { 238 if(count<=0) { 239 return NULL; /* no string */ 240 } else if(U16_IS_SURROGATE(c)) { 241 /* make sure to not find half of a surrogate pair */ 242 return u_strFindFirst(s, count, &c, 1); 243 } else { 244 /* trivial search for a BMP code point */ 245 const UChar *limit=s+count; 246 do { 247 if(*s==c) { 248 return (UChar *)s; 249 } 250 } while(++s!=limit); 251 return NULL; 252 } 253 } 254 255 U_CAPI UChar * U_EXPORT2 256 u_memchr32(const UChar *s, UChar32 c, int32_t count) { 257 if((uint32_t)c<=U_BMP_MAX) { 258 /* find BMP code point */ 259 return u_memchr(s, (UChar)c, count); 260 } else if(count<2) { 261 /* too short for a surrogate pair */ 262 return NULL; 263 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 264 /* find supplementary code point as surrogate pair */ 265 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ 266 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 267 268 do { 269 if(*s==lead && *(s+1)==trail) { 270 return (UChar *)s; 271 } 272 } while(++s!=limit); 273 return NULL; 274 } else { 275 /* not a Unicode code point, not findable */ 276 return NULL; 277 } 278 } 279 280 /* Backward binary string search functions ---------------------------------- */ 281 282 U_CAPI UChar * U_EXPORT2 283 u_strFindLast(const UChar *s, int32_t length, 284 const UChar *sub, int32_t subLength) { 285 const UChar *start, *limit, *p, *q, *subLimit; 286 UChar c, cs; 287 288 if(sub==NULL || subLength<-1) { 289 return (UChar *)s; 290 } 291 if(s==NULL || length<-1) { 292 return NULL; 293 } 294 295 /* 296 * This implementation is more lazy than the one for u_strFindFirst(): 297 * There is no special search code for NUL-terminated strings. 298 * It does not seem to be worth it for searching substrings to 299 * search forward and find all matches like in u_strrchr() and similar. 300 * Therefore, we simply get both string lengths and search backward. 301 * 302 * markus 2002oct23 303 */ 304 305 if(subLength<0) { 306 subLength=u_strlen(sub); 307 } 308 if(subLength==0) { 309 return (UChar *)s; 310 } 311 312 /* get sub[subLength-1] to search for it fast */ 313 subLimit=sub+subLength; 314 cs=*(--subLimit); 315 --subLength; 316 317 if(subLength==0 && !U16_IS_SURROGATE(cs)) { 318 /* the substring consists of a single, non-surrogate BMP code point */ 319 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); 320 } 321 322 if(length<0) { 323 length=u_strlen(s); 324 } 325 326 /* subLength was decremented above */ 327 if(length<=subLength) { 328 return NULL; /* s is shorter than sub */ 329 } 330 331 start=s; 332 limit=s+length; 333 334 /* the substring must start no later than s+subLength */ 335 s+=subLength; 336 337 while(s!=limit) { 338 c=*(--limit); 339 if(c==cs) { 340 /* found last substring UChar, compare rest */ 341 p=limit; 342 q=subLimit; 343 for(;;) { 344 if(q==sub) { 345 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { 346 return (UChar *)p; /* well-formed match */ 347 } else { 348 break; /* no match because surrogate pair is split */ 349 } 350 } 351 if(*(--p)!=*(--q)) { 352 break; /* no match */ 353 } 354 } 355 } 356 } 357 358 /* not found */ 359 return NULL; 360 } 361 362 U_CAPI UChar * U_EXPORT2 363 u_strrstr(const UChar *s, const UChar *substring) { 364 return u_strFindLast(s, -1, substring, -1); 365 } 366 367 U_CAPI UChar * U_EXPORT2 368 u_strrchr(const UChar *s, UChar c) { 369 if(U16_IS_SURROGATE(c)) { 370 /* make sure to not find half of a surrogate pair */ 371 return u_strFindLast(s, -1, &c, 1); 372 } else { 373 const UChar *result=NULL; 374 UChar cs; 375 376 /* trivial search for a BMP code point */ 377 for(;;) { 378 if((cs=*s)==c) { 379 result=s; 380 } 381 if(cs==0) { 382 return (UChar *)result; 383 } 384 ++s; 385 } 386 } 387 } 388 389 U_CAPI UChar * U_EXPORT2 390 u_strrchr32(const UChar *s, UChar32 c) { 391 if((uint32_t)c<=U_BMP_MAX) { 392 /* find BMP code point */ 393 return u_strrchr(s, (UChar)c); 394 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 395 /* find supplementary code point as surrogate pair */ 396 const UChar *result=NULL; 397 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 398 399 while((cs=*s++)!=0) { 400 if(cs==lead && *s==trail) { 401 result=s-1; 402 } 403 } 404 return (UChar *)result; 405 } else { 406 /* not a Unicode code point, not findable */ 407 return NULL; 408 } 409 } 410 411 U_CAPI UChar * U_EXPORT2 412 u_memrchr(const UChar *s, UChar c, int32_t count) { 413 if(count<=0) { 414 return NULL; /* no string */ 415 } else if(U16_IS_SURROGATE(c)) { 416 /* make sure to not find half of a surrogate pair */ 417 return u_strFindLast(s, count, &c, 1); 418 } else { 419 /* trivial search for a BMP code point */ 420 const UChar *limit=s+count; 421 do { 422 if(*(--limit)==c) { 423 return (UChar *)limit; 424 } 425 } while(s!=limit); 426 return NULL; 427 } 428 } 429 430 U_CAPI UChar * U_EXPORT2 431 u_memrchr32(const UChar *s, UChar32 c, int32_t count) { 432 if((uint32_t)c<=U_BMP_MAX) { 433 /* find BMP code point */ 434 return u_memrchr(s, (UChar)c, count); 435 } else if(count<2) { 436 /* too short for a surrogate pair */ 437 return NULL; 438 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 439 /* find supplementary code point as surrogate pair */ 440 const UChar *limit=s+count-1; 441 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 442 443 do { 444 if(*limit==trail && *(limit-1)==lead) { 445 return (UChar *)(limit-1); 446 } 447 } while(s!=--limit); 448 return NULL; 449 } else { 450 /* not a Unicode code point, not findable */ 451 return NULL; 452 } 453 } 454 455 /* Tokenization functions --------------------------------------------------- */ 456 457 /* 458 * Match each code point in a string against each code point in the matchSet. 459 * Return the index of the first string code point that 460 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. 461 * Return -(string length)-1 if there is no such code point. 462 */ 463 static int32_t 464 _matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { 465 int32_t matchLen, matchBMPLen, strItr, matchItr; 466 UChar32 stringCh, matchCh; 467 UChar c, c2; 468 469 /* first part of matchSet contains only BMP code points */ 470 matchBMPLen = 0; 471 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { 472 ++matchBMPLen; 473 } 474 475 /* second part of matchSet contains BMP and supplementary code points */ 476 matchLen = matchBMPLen; 477 while(matchSet[matchLen] != 0) { 478 ++matchLen; 479 } 480 481 for(strItr = 0; (c = string[strItr]) != 0;) { 482 ++strItr; 483 if(U16_IS_SINGLE(c)) { 484 if(polarity) { 485 for(matchItr = 0; matchItr < matchLen; ++matchItr) { 486 if(c == matchSet[matchItr]) { 487 return strItr - 1; /* one matches */ 488 } 489 } 490 } else { 491 for(matchItr = 0; matchItr < matchLen; ++matchItr) { 492 if(c == matchSet[matchItr]) { 493 goto endloop; 494 } 495 } 496 return strItr - 1; /* none matches */ 497 } 498 } else { 499 /* 500 * No need to check for string length before U16_IS_TRAIL 501 * because c2 could at worst be the terminating NUL. 502 */ 503 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { 504 ++strItr; 505 stringCh = U16_GET_SUPPLEMENTARY(c, c2); 506 } else { 507 stringCh = c; /* unpaired trail surrogate */ 508 } 509 510 if(polarity) { 511 for(matchItr = matchBMPLen; matchItr < matchLen;) { 512 U16_NEXT(matchSet, matchItr, matchLen, matchCh); 513 if(stringCh == matchCh) { 514 return strItr - U16_LENGTH(stringCh); /* one matches */ 515 } 516 } 517 } else { 518 for(matchItr = matchBMPLen; matchItr < matchLen;) { 519 U16_NEXT(matchSet, matchItr, matchLen, matchCh); 520 if(stringCh == matchCh) { 521 goto endloop; 522 } 523 } 524 return strItr - U16_LENGTH(stringCh); /* none matches */ 525 } 526 } 527 endloop: 528 /* wish C had continue with labels like Java... */; 529 } 530 531 /* Didn't find it. */ 532 return -strItr-1; 533 } 534 535 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 536 U_CAPI UChar * U_EXPORT2 537 u_strpbrk(const UChar *string, const UChar *matchSet) 538 { 539 int32_t idx = _matchFromSet(string, matchSet, TRUE); 540 if(idx >= 0) { 541 return (UChar *)string + idx; 542 } else { 543 return NULL; 544 } 545 } 546 547 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 548 U_CAPI int32_t U_EXPORT2 549 u_strcspn(const UChar *string, const UChar *matchSet) 550 { 551 int32_t idx = _matchFromSet(string, matchSet, TRUE); 552 if(idx >= 0) { 553 return idx; 554 } else { 555 return -idx - 1; /* == u_strlen(string) */ 556 } 557 } 558 559 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ 560 U_CAPI int32_t U_EXPORT2 561 u_strspn(const UChar *string, const UChar *matchSet) 562 { 563 int32_t idx = _matchFromSet(string, matchSet, FALSE); 564 if(idx >= 0) { 565 return idx; 566 } else { 567 return -idx - 1; /* == u_strlen(string) */ 568 } 569 } 570 571 /* ----- Text manipulation functions --- */ 572 573 U_CAPI UChar* U_EXPORT2 574 u_strtok_r(UChar *src, 575 const UChar *delim, 576 UChar **saveState) 577 { 578 UChar *tokSource; 579 UChar *nextToken; 580 uint32_t nonDelimIdx; 581 582 /* If saveState is NULL, the user messed up. */ 583 if (src != NULL) { 584 tokSource = src; 585 *saveState = src; /* Set to "src" in case there are no delimiters */ 586 } 587 else if (*saveState) { 588 tokSource = *saveState; 589 } 590 else { 591 /* src == NULL && *saveState == NULL */ 592 /* This shouldn't happen. We already finished tokenizing. */ 593 return NULL; 594 } 595 596 /* Skip initial delimiters */ 597 nonDelimIdx = u_strspn(tokSource, delim); 598 tokSource = &tokSource[nonDelimIdx]; 599 600 if (*tokSource) { 601 nextToken = u_strpbrk(tokSource, delim); 602 if (nextToken != NULL) { 603 /* Create a token */ 604 *(nextToken++) = 0; 605 *saveState = nextToken; 606 return tokSource; 607 } 608 else if (*saveState) { 609 /* Return the last token */ 610 *saveState = NULL; 611 return tokSource; 612 } 613 } 614 else { 615 /* No tokens were found. Only delimiters were left. */ 616 *saveState = NULL; 617 } 618 return NULL; 619 } 620 621 /* Miscellaneous functions -------------------------------------------------- */ 622 623 U_CAPI UChar* U_EXPORT2 624 u_strcat(UChar *dst, 625 const UChar *src) 626 { 627 UChar *anchor = dst; /* save a pointer to start of dst */ 628 629 while(*dst != 0) { /* To end of first string */ 630 ++dst; 631 } 632 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 633 } 634 635 return anchor; 636 } 637 638 U_CAPI UChar* U_EXPORT2 639 u_strncat(UChar *dst, 640 const UChar *src, 641 int32_t n ) 642 { 643 if(n > 0) { 644 UChar *anchor = dst; /* save a pointer to start of dst */ 645 646 while(*dst != 0) { /* To end of first string */ 647 ++dst; 648 } 649 while((*dst = *src) != 0) { /* copy string 2 over */ 650 ++dst; 651 if(--n == 0) { 652 *dst = 0; 653 break; 654 } 655 ++src; 656 } 657 658 return anchor; 659 } else { 660 return dst; 661 } 662 } 663 664 /* ----- Text property functions --- */ 665 666 U_CAPI int32_t U_EXPORT2 667 u_strcmp(const UChar *s1, 668 const UChar *s2) 669 { 670 UChar c1, c2; 671 672 for(;;) { 673 c1=*s1++; 674 c2=*s2++; 675 if (c1 != c2 || c1 == 0) { 676 break; 677 } 678 } 679 return (int32_t)c1 - (int32_t)c2; 680 } 681 682 U_CFUNC int32_t U_EXPORT2 683 uprv_strCompare(const UChar *s1, int32_t length1, 684 const UChar *s2, int32_t length2, 685 UBool strncmpStyle, UBool codePointOrder) { 686 const UChar *start1, *start2, *limit1, *limit2; 687 UChar c1, c2; 688 689 /* setup for fix-up */ 690 start1=s1; 691 start2=s2; 692 693 /* compare identical prefixes - they do not need to be fixed up */ 694 if(length1<0 && length2<0) { 695 /* strcmp style, both NUL-terminated */ 696 if(s1==s2) { 697 return 0; 698 } 699 700 for(;;) { 701 c1=*s1; 702 c2=*s2; 703 if(c1!=c2) { 704 break; 705 } 706 if(c1==0) { 707 return 0; 708 } 709 ++s1; 710 ++s2; 711 } 712 713 /* setup for fix-up */ 714 limit1=limit2=NULL; 715 } else if(strncmpStyle) { 716 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ 717 if(s1==s2) { 718 return 0; 719 } 720 721 limit1=start1+length1; 722 723 for(;;) { 724 /* both lengths are same, check only one limit */ 725 if(s1==limit1) { 726 return 0; 727 } 728 729 c1=*s1; 730 c2=*s2; 731 if(c1!=c2) { 732 break; 733 } 734 if(c1==0) { 735 return 0; 736 } 737 ++s1; 738 ++s2; 739 } 740 741 /* setup for fix-up */ 742 limit2=start2+length1; /* use length1 here, too, to enforce assumption */ 743 } else { 744 /* memcmp/UnicodeString style, both length-specified */ 745 int32_t lengthResult; 746 747 if(length1<0) { 748 length1=u_strlen(s1); 749 } 750 if(length2<0) { 751 length2=u_strlen(s2); 752 } 753 754 /* limit1=start1+min(lenght1, length2) */ 755 if(length1<length2) { 756 lengthResult=-1; 757 limit1=start1+length1; 758 } else if(length1==length2) { 759 lengthResult=0; 760 limit1=start1+length1; 761 } else /* length1>length2 */ { 762 lengthResult=1; 763 limit1=start1+length2; 764 } 765 766 if(s1==s2) { 767 return lengthResult; 768 } 769 770 for(;;) { 771 /* check pseudo-limit */ 772 if(s1==limit1) { 773 return lengthResult; 774 } 775 776 c1=*s1; 777 c2=*s2; 778 if(c1!=c2) { 779 break; 780 } 781 ++s1; 782 ++s2; 783 } 784 785 /* setup for fix-up */ 786 limit1=start1+length1; 787 limit2=start2+length2; 788 } 789 790 /* if both values are in or above the surrogate range, fix them up */ 791 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 792 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 793 if( 794 (c1<=0xdbff && (s1+1)!=limit1 && UTF_IS_TRAIL(*(s1+1))) || 795 (UTF_IS_TRAIL(c1) && start1!=s1 && UTF_IS_LEAD(*(s1-1))) 796 ) { 797 /* part of a surrogate pair, leave >=d800 */ 798 } else { 799 /* BMP code point - may be surrogate code point - make <d800 */ 800 c1-=0x2800; 801 } 802 803 if( 804 (c2<=0xdbff && (s2+1)!=limit2 && UTF_IS_TRAIL(*(s2+1))) || 805 (UTF_IS_TRAIL(c2) && start2!=s2 && UTF_IS_LEAD(*(s2-1))) 806 ) { 807 /* part of a surrogate pair, leave >=d800 */ 808 } else { 809 /* BMP code point - may be surrogate code point - make <d800 */ 810 c2-=0x2800; 811 } 812 } 813 814 /* now c1 and c2 are in the requested (code unit or code point) order */ 815 return (int32_t)c1-(int32_t)c2; 816 } 817 818 /* 819 * Compare two strings as presented by UCharIterators. 820 * Use code unit or code point order. 821 * When the function returns, it is undefined where the iterators 822 * have stopped. 823 */ 824 U_CAPI int32_t U_EXPORT2 825 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { 826 UChar32 c1, c2; 827 828 /* argument checking */ 829 if(iter1==NULL || iter2==NULL) { 830 return 0; /* bad arguments */ 831 } 832 if(iter1==iter2) { 833 return 0; /* identical iterators */ 834 } 835 836 /* reset iterators to start? */ 837 iter1->move(iter1, 0, UITER_START); 838 iter2->move(iter2, 0, UITER_START); 839 840 /* compare identical prefixes - they do not need to be fixed up */ 841 for(;;) { 842 c1=iter1->next(iter1); 843 c2=iter2->next(iter2); 844 if(c1!=c2) { 845 break; 846 } 847 if(c1==-1) { 848 return 0; 849 } 850 } 851 852 /* if both values are in or above the surrogate range, fix them up */ 853 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 854 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 855 if( 856 (c1<=0xdbff && UTF_IS_TRAIL(iter1->current(iter1))) || 857 (UTF_IS_TRAIL(c1) && (iter1->previous(iter1), UTF_IS_LEAD(iter1->previous(iter1)))) 858 ) { 859 /* part of a surrogate pair, leave >=d800 */ 860 } else { 861 /* BMP code point - may be surrogate code point - make <d800 */ 862 c1-=0x2800; 863 } 864 865 if( 866 (c2<=0xdbff && UTF_IS_TRAIL(iter2->current(iter2))) || 867 (UTF_IS_TRAIL(c2) && (iter2->previous(iter2), UTF_IS_LEAD(iter2->previous(iter2)))) 868 ) { 869 /* part of a surrogate pair, leave >=d800 */ 870 } else { 871 /* BMP code point - may be surrogate code point - make <d800 */ 872 c2-=0x2800; 873 } 874 } 875 876 /* now c1 and c2 are in the requested (code unit or code point) order */ 877 return (int32_t)c1-(int32_t)c2; 878 } 879 880 #if 0 881 /* 882 * u_strCompareIter() does not leave the iterators _on_ the different units. 883 * This is possible but would cost a few extra indirect function calls to back 884 * up if the last unit (c1 or c2 respectively) was >=0. 885 * 886 * Consistently leaving them _behind_ the different units is not an option 887 * because the current "unit" is the end of the string if that is reached, 888 * and in such a case the iterator does not move. 889 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end 890 * of their strings. Calling previous() on each does not move them to where 891 * the comparison fails. 892 * 893 * So the simplest semantics is to not define where the iterators end up. 894 * 895 * The following fragment is part of what would need to be done for backing up. 896 */ 897 void fragment { 898 /* iff a surrogate is part of a surrogate pair, leave >=d800 */ 899 if(c1<=0xdbff) { 900 if(!UTF_IS_TRAIL(iter1->current(iter1))) { 901 /* lead surrogate code point - make <d800 */ 902 c1-=0x2800; 903 } 904 } else if(c1<=0xdfff) { 905 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); 906 iter1->previous(iter1); /* ==c1 */ 907 if(!UTF_IS_LEAD(iter1->previous(iter1))) { 908 /* trail surrogate code point - make <d800 */ 909 c1-=0x2800; 910 } 911 /* go back to behind where the difference is */ 912 iter1->move(iter1, idx, UITER_ZERO); 913 } else /* 0xe000<=c1<=0xffff */ { 914 /* BMP code point - make <d800 */ 915 c1-=0x2800; 916 } 917 } 918 #endif 919 920 U_CAPI int32_t U_EXPORT2 921 u_strCompare(const UChar *s1, int32_t length1, 922 const UChar *s2, int32_t length2, 923 UBool codePointOrder) { 924 /* argument checking */ 925 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 926 return 0; 927 } 928 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); 929 } 930 931 /* String compare in code point order - u_strcmp() compares in code unit order. */ 932 U_CAPI int32_t U_EXPORT2 933 u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { 934 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); 935 } 936 937 U_CAPI int32_t U_EXPORT2 938 u_strncmp(const UChar *s1, 939 const UChar *s2, 940 int32_t n) 941 { 942 if(n > 0) { 943 int32_t rc; 944 for(;;) { 945 rc = (int32_t)*s1 - (int32_t)*s2; 946 if(rc != 0 || *s1 == 0 || --n == 0) { 947 return rc; 948 } 949 ++s1; 950 ++s2; 951 } 952 } else { 953 return 0; 954 } 955 } 956 957 U_CAPI int32_t U_EXPORT2 958 u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { 959 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); 960 } 961 962 U_CAPI UChar* U_EXPORT2 963 u_strcpy(UChar *dst, 964 const UChar *src) 965 { 966 UChar *anchor = dst; /* save a pointer to start of dst */ 967 968 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 969 } 970 971 return anchor; 972 } 973 974 U_CAPI UChar* U_EXPORT2 975 u_strncpy(UChar *dst, 976 const UChar *src, 977 int32_t n) 978 { 979 UChar *anchor = dst; /* save a pointer to start of dst */ 980 981 /* copy string 2 over */ 982 while(n > 0 && (*(dst++) = *(src++)) != 0) { 983 --n; 984 } 985 986 return anchor; 987 } 988 989 U_CAPI int32_t U_EXPORT2 990 u_strlen(const UChar *s) 991 { 992 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR 993 return (int32_t)uprv_wcslen(s); 994 #else 995 const UChar *t = s; 996 while(*t != 0) { 997 ++t; 998 } 999 return t - s; 1000 #endif 1001 } 1002 1003 U_CAPI int32_t U_EXPORT2 1004 u_countChar32(const UChar *s, int32_t length) { 1005 int32_t count; 1006 1007 if(s==NULL || length<-1) { 1008 return 0; 1009 } 1010 1011 count=0; 1012 if(length>=0) { 1013 while(length>0) { 1014 ++count; 1015 if(UTF_IS_LEAD(*s) && length>=2 && UTF_IS_TRAIL(*(s+1))) { 1016 s+=2; 1017 length-=2; 1018 } else { 1019 ++s; 1020 --length; 1021 } 1022 } 1023 } else /* length==-1 */ { 1024 UChar c; 1025 1026 for(;;) { 1027 if((c=*s++)==0) { 1028 break; 1029 } 1030 ++count; 1031 1032 /* 1033 * sufficient to look ahead one because of UTF-16; 1034 * safe to look ahead one because at worst that would be the terminating NUL 1035 */ 1036 if(UTF_IS_LEAD(c) && UTF_IS_TRAIL(*s)) { 1037 ++s; 1038 } 1039 } 1040 } 1041 return count; 1042 } 1043 1044 U_CAPI UBool U_EXPORT2 1045 u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { 1046 1047 if(number<0) { 1048 return TRUE; 1049 } 1050 if(s==NULL || length<-1) { 1051 return FALSE; 1052 } 1053 1054 if(length==-1) { 1055 /* s is NUL-terminated */ 1056 UChar c; 1057 1058 /* count code points until they exceed */ 1059 for(;;) { 1060 if((c=*s++)==0) { 1061 return FALSE; 1062 } 1063 if(number==0) { 1064 return TRUE; 1065 } 1066 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { 1067 ++s; 1068 } 1069 --number; 1070 } 1071 } else { 1072 /* length>=0 known */ 1073 const UChar *limit; 1074 int32_t maxSupplementary; 1075 1076 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ 1077 if(((length+1)/2)>number) { 1078 return TRUE; 1079 } 1080 1081 /* check if s does not even contain enough UChars */ 1082 maxSupplementary=length-number; 1083 if(maxSupplementary<=0) { 1084 return FALSE; 1085 } 1086 /* there are maxSupplementary=length-number more UChars than asked-for code points */ 1087 1088 /* 1089 * count code points until they exceed and also check that there are 1090 * no more than maxSupplementary supplementary code points (UChar pairs) 1091 */ 1092 limit=s+length; 1093 for(;;) { 1094 if(s==limit) { 1095 return FALSE; 1096 } 1097 if(number==0) { 1098 return TRUE; 1099 } 1100 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { 1101 ++s; 1102 if(--maxSupplementary<=0) { 1103 /* too many pairs - too few code points */ 1104 return FALSE; 1105 } 1106 } 1107 --number; 1108 } 1109 } 1110 } 1111 1112 U_CAPI UChar * U_EXPORT2 1113 u_memcpy(UChar *dest, const UChar *src, int32_t count) { 1114 return (UChar *)uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); 1115 } 1116 1117 U_CAPI UChar * U_EXPORT2 1118 u_memmove(UChar *dest, const UChar *src, int32_t count) { 1119 return (UChar *)uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); 1120 } 1121 1122 U_CAPI UChar * U_EXPORT2 1123 u_memset(UChar *dest, UChar c, int32_t count) { 1124 if(count > 0) { 1125 UChar *ptr = dest; 1126 UChar *limit = dest + count; 1127 1128 while (ptr < limit) { 1129 *(ptr++) = c; 1130 } 1131 } 1132 return dest; 1133 } 1134 1135 U_CAPI int32_t U_EXPORT2 1136 u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { 1137 if(count > 0) { 1138 const UChar *limit = buf1 + count; 1139 int32_t result; 1140 1141 while (buf1 < limit) { 1142 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; 1143 if (result != 0) { 1144 return result; 1145 } 1146 buf1++; 1147 buf2++; 1148 } 1149 } 1150 return 0; 1151 } 1152 1153 U_CAPI int32_t U_EXPORT2 1154 u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { 1155 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); 1156 } 1157 1158 /* u_unescape & support fns ------------------------------------------------- */ 1159 1160 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 1161 static const UChar UNESCAPE_MAP[] = { 1162 /*" 0x22, 0x22 */ 1163 /*' 0x27, 0x27 */ 1164 /*? 0x3F, 0x3F */ 1165 /*\ 0x5C, 0x5C */ 1166 /*a*/ 0x61, 0x07, 1167 /*b*/ 0x62, 0x08, 1168 /*e*/ 0x65, 0x1b, 1169 /*f*/ 0x66, 0x0c, 1170 /*n*/ 0x6E, 0x0a, 1171 /*r*/ 0x72, 0x0d, 1172 /*t*/ 0x74, 0x09, 1173 /*v*/ 0x76, 0x0b 1174 }; 1175 enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; 1176 1177 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */ 1178 static int8_t _digit8(UChar c) { 1179 if (c >= 0x0030 && c <= 0x0037) { 1180 return (int8_t)(c - 0x0030); 1181 } 1182 return -1; 1183 } 1184 1185 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */ 1186 static int8_t _digit16(UChar c) { 1187 if (c >= 0x0030 && c <= 0x0039) { 1188 return (int8_t)(c - 0x0030); 1189 } 1190 if (c >= 0x0041 && c <= 0x0046) { 1191 return (int8_t)(c - (0x0041 - 10)); 1192 } 1193 if (c >= 0x0061 && c <= 0x0066) { 1194 return (int8_t)(c - (0x0061 - 10)); 1195 } 1196 return -1; 1197 } 1198 1199 /* Parse a single escape sequence. Although this method deals in 1200 * UChars, it does not use C++ or UnicodeString. This allows it to 1201 * be used from C contexts. */ 1202 U_CAPI UChar32 U_EXPORT2 1203 u_unescapeAt(UNESCAPE_CHAR_AT charAt, 1204 int32_t *offset, 1205 int32_t length, 1206 void *context) { 1207 1208 int32_t start = *offset; 1209 UChar c; 1210 UChar32 result = 0; 1211 int8_t n = 0; 1212 int8_t minDig = 0; 1213 int8_t maxDig = 0; 1214 int8_t bitsPerDigit = 4; 1215 int8_t dig; 1216 int32_t i; 1217 UBool braces = FALSE; 1218 1219 /* Check that offset is in range */ 1220 if (*offset < 0 || *offset >= length) { 1221 goto err; 1222 } 1223 1224 /* Fetch first UChar after '\\' */ 1225 c = charAt((*offset)++, context); 1226 1227 /* Convert hexadecimal and octal escapes */ 1228 switch (c) { 1229 case 0x0075 /*'u'*/: 1230 minDig = maxDig = 4; 1231 break; 1232 case 0x0055 /*'U'*/: 1233 minDig = maxDig = 8; 1234 break; 1235 case 0x0078 /*'x'*/: 1236 minDig = 1; 1237 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { 1238 ++(*offset); 1239 braces = TRUE; 1240 maxDig = 8; 1241 } else { 1242 maxDig = 2; 1243 } 1244 break; 1245 default: 1246 dig = _digit8(c); 1247 if (dig >= 0) { 1248 minDig = 1; 1249 maxDig = 3; 1250 n = 1; /* Already have first octal digit */ 1251 bitsPerDigit = 3; 1252 result = dig; 1253 } 1254 break; 1255 } 1256 if (minDig != 0) { 1257 while (*offset < length && n < maxDig) { 1258 c = charAt(*offset, context); 1259 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); 1260 if (dig < 0) { 1261 break; 1262 } 1263 result = (result << bitsPerDigit) | dig; 1264 ++(*offset); 1265 ++n; 1266 } 1267 if (n < minDig) { 1268 goto err; 1269 } 1270 if (braces) { 1271 if (c != 0x7D /*}*/) { 1272 goto err; 1273 } 1274 ++(*offset); 1275 } 1276 if (result < 0 || result >= 0x110000) { 1277 goto err; 1278 } 1279 /* If an escape sequence specifies a lead surrogate, see if 1280 * there is a trail surrogate after it, either as an escape or 1281 * as a literal. If so, join them up into a supplementary. 1282 */ 1283 if (*offset < length && U16_IS_LEAD(result)) { 1284 int32_t ahead = *offset + 1; 1285 c = charAt(*offset, context); 1286 if (c == 0x5C /*'\\'*/ && ahead < length) { 1287 c = (UChar) u_unescapeAt(charAt, &ahead, length, context); 1288 } 1289 if (U16_IS_TRAIL(c)) { 1290 *offset = ahead; 1291 result = U16_GET_SUPPLEMENTARY(result, c); 1292 } 1293 } 1294 return result; 1295 } 1296 1297 /* Convert C-style escapes in table */ 1298 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { 1299 if (c == UNESCAPE_MAP[i]) { 1300 return UNESCAPE_MAP[i+1]; 1301 } else if (c < UNESCAPE_MAP[i]) { 1302 break; 1303 } 1304 } 1305 1306 /* Map \cX to control-X: X & 0x1F */ 1307 if (c == 0x0063 /*'c'*/ && *offset < length) { 1308 c = charAt((*offset)++, context); 1309 if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { 1310 UChar c2 = charAt(*offset, context); 1311 if (UTF_IS_SECOND_SURROGATE(c2)) { 1312 ++(*offset); 1313 c = (UChar) UTF16_GET_PAIR_VALUE(c, c2); /* [sic] */ 1314 } 1315 } 1316 return 0x1F & c; 1317 } 1318 1319 /* If no special forms are recognized, then consider 1320 * the backslash to generically escape the next character. 1321 * Deal with surrogate pairs. */ 1322 if (UTF_IS_FIRST_SURROGATE(c) && *offset < length) { 1323 UChar c2 = charAt(*offset, context); 1324 if (UTF_IS_SECOND_SURROGATE(c2)) { 1325 ++(*offset); 1326 return UTF16_GET_PAIR_VALUE(c, c2); 1327 } 1328 } 1329 return c; 1330 1331 err: 1332 /* Invalid escape sequence */ 1333 *offset = start; /* Reset to initial value */ 1334 return (UChar32)0xFFFFFFFF; 1335 } 1336 1337 /* u_unescapeAt() callback to return a UChar from a char* */ 1338 static UChar U_CALLCONV 1339 _charPtr_charAt(int32_t offset, void *context) { 1340 UChar c16; 1341 /* It would be more efficient to access the invariant tables 1342 * directly but there is no API for that. */ 1343 u_charsToUChars(((char*) context) + offset, &c16, 1); 1344 return c16; 1345 } 1346 1347 /* Append an escape-free segment of the text; used by u_unescape() */ 1348 static void _appendUChars(UChar *dest, int32_t destCapacity, 1349 const char *src, int32_t srcLen) { 1350 if (destCapacity < 0) { 1351 destCapacity = 0; 1352 } 1353 if (srcLen > destCapacity) { 1354 srcLen = destCapacity; 1355 } 1356 u_charsToUChars(src, dest, srcLen); 1357 } 1358 1359 /* Do an invariant conversion of char* -> UChar*, with escape parsing */ 1360 U_CAPI int32_t U_EXPORT2 1361 u_unescape(const char *src, UChar *dest, int32_t destCapacity) { 1362 const char *segment = src; 1363 int32_t i = 0; 1364 char c; 1365 1366 while ((c=*src) != 0) { 1367 /* '\\' intentionally written as compiler-specific 1368 * character constant to correspond to compiler-specific 1369 * char* constants. */ 1370 if (c == '\\') { 1371 int32_t lenParsed = 0; 1372 UChar32 c32; 1373 if (src != segment) { 1374 if (dest != NULL) { 1375 _appendUChars(dest + i, destCapacity - i, 1376 segment, (int32_t)(src - segment)); 1377 } 1378 i += (int32_t)(src - segment); 1379 } 1380 ++src; /* advance past '\\' */ 1381 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); 1382 if (lenParsed == 0) { 1383 goto err; 1384 } 1385 src += lenParsed; /* advance past escape seq. */ 1386 if (dest != NULL && UTF_CHAR_LENGTH(c32) <= (destCapacity - i)) { 1387 UTF_APPEND_CHAR_UNSAFE(dest, i, c32); 1388 } else { 1389 i += UTF_CHAR_LENGTH(c32); 1390 } 1391 segment = src; 1392 } else { 1393 ++src; 1394 } 1395 } 1396 if (src != segment) { 1397 if (dest != NULL) { 1398 _appendUChars(dest + i, destCapacity - i, 1399 segment, (int32_t)(src - segment)); 1400 } 1401 i += (int32_t)(src - segment); 1402 } 1403 if (dest != NULL && i < destCapacity) { 1404 dest[i] = 0; 1405 } 1406 return i; 1407 1408 err: 1409 if (dest != NULL && destCapacity > 0) { 1410 *dest = 0; 1411 } 1412 return 0; 1413 } 1414 1415 /* NUL-termination of strings ----------------------------------------------- */ 1416 1417 /** 1418 * NUL-terminate a string no matter what its type. 1419 * Set warning and error codes accordingly. 1420 */ 1421 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ 1422 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ 1423 /* not a public function, so no complete argument checking */ \ 1424 \ 1425 if(length<0) { \ 1426 /* assume that the caller handles this */ \ 1427 } else if(length<destCapacity) { \ 1428 /* NUL-terminate the string, the NUL fits */ \ 1429 dest[length]=0; \ 1430 /* unset the not-terminated warning but leave all others */ \ 1431 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \ 1432 *pErrorCode=U_ZERO_ERROR; \ 1433 } \ 1434 } else if(length==destCapacity) { \ 1435 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \ 1436 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \ 1437 } else /* length>destCapacity */ { \ 1438 /* even the string itself did not fit - set an error code */ \ 1439 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ 1440 } \ 1441 } 1442 1443 U_CAPI int32_t U_EXPORT2 1444 u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1445 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1446 return length; 1447 } 1448 1449 U_CAPI int32_t U_EXPORT2 1450 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1451 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1452 return length; 1453 } 1454 1455 U_CAPI int32_t U_EXPORT2 1456 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1457 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1458 return length; 1459 } 1460 1461 U_CAPI int32_t U_EXPORT2 1462 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1463 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1464 return length; 1465 } 1466