1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ustrcase.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb20 14 * created by: Markus W. Scherer 15 * 16 * Implementation file for string casing C API functions. 17 * Uses functions from uchar.c for basic functionality that requires access 18 * to the Unicode Character Database (uprops.dat). 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uloc.h" 23 #include "unicode/ustring.h" 24 #include "unicode/ucasemap.h" 25 #include "unicode/ubrk.h" 26 #include "cmemory.h" 27 #include "ucase.h" 28 #include "unormimp.h" 29 #include "ustr_imp.h" 30 31 /* string casing ------------------------------------------------------------ */ 32 33 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 34 static U_INLINE int32_t 35 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, 36 int32_t result, const UChar *s) { 37 UChar32 c; 38 int32_t length; 39 40 /* decode the result */ 41 if(result<0) { 42 /* (not) original code point */ 43 c=~result; 44 length=-1; 45 } else if(result<=UCASE_MAX_STRING_LENGTH) { 46 c=U_SENTINEL; 47 length=result; 48 } else { 49 c=result; 50 length=-1; 51 } 52 53 if(destIndex<destCapacity) { 54 /* append the result */ 55 if(length<0) { 56 /* code point */ 57 UBool isError=FALSE; 58 U16_APPEND(dest, destIndex, destCapacity, c, isError); 59 if(isError) { 60 /* overflow, nothing written */ 61 destIndex+=U16_LENGTH(c); 62 } 63 } else { 64 /* string */ 65 if((destIndex+length)<=destCapacity) { 66 while(length>0) { 67 dest[destIndex++]=*s++; 68 --length; 69 } 70 } else { 71 /* overflow */ 72 destIndex+=length; 73 } 74 } 75 } else { 76 /* preflight */ 77 if(length<0) { 78 destIndex+=U16_LENGTH(c); 79 } else { 80 destIndex+=length; 81 } 82 } 83 return destIndex; 84 } 85 86 static UChar32 U_CALLCONV 87 utf16_caseContextIterator(void *context, int8_t dir) { 88 UCaseContext *csc=(UCaseContext *)context; 89 UChar32 c; 90 91 if(dir<0) { 92 /* reset for backward iteration */ 93 csc->index=csc->cpStart; 94 csc->dir=dir; 95 } else if(dir>0) { 96 /* reset for forward iteration */ 97 csc->index=csc->cpLimit; 98 csc->dir=dir; 99 } else { 100 /* continue current iteration direction */ 101 dir=csc->dir; 102 } 103 104 if(dir<0) { 105 if(csc->start<csc->index) { 106 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); 107 return c; 108 } 109 } else { 110 if(csc->index<csc->limit) { 111 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); 112 return c; 113 } 114 } 115 return U_SENTINEL; 116 } 117 118 /* 119 * Case-maps [srcStart..srcLimit[ but takes 120 * context [0..srcLength[ into account. 121 */ 122 static int32_t 123 _caseMap(const UCaseMap *csm, UCaseMapFull *map, 124 UChar *dest, int32_t destCapacity, 125 const UChar *src, UCaseContext *csc, 126 int32_t srcStart, int32_t srcLimit, 127 UErrorCode *pErrorCode) { 128 const UChar *s; 129 UChar32 c, c2 = 0; 130 int32_t srcIndex, destIndex; 131 int32_t locCache; 132 133 locCache=csm->locCache; 134 135 /* case mapping loop */ 136 srcIndex=srcStart; 137 destIndex=0; 138 while(srcIndex<srcLimit) { 139 csc->cpStart=srcIndex; 140 U16_NEXT(src, srcIndex, srcLimit, c); 141 csc->cpLimit=srcIndex; 142 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); 143 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 144 /* fast path version of appendResult() for BMP results */ 145 dest[destIndex++]=(UChar)c2; 146 } else { 147 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 148 } 149 } 150 151 if(destIndex>destCapacity) { 152 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 153 } 154 return destIndex; 155 } 156 157 static void 158 setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 159 /* 160 * We could call ucasemap_setLocale(), but here we really only care about 161 * the initial language subtag, we need not return the real string via 162 * ucasemap_getLocale(), and we don't care about only getting "x" from 163 * "x-some-thing" etc. 164 * 165 * We ignore locales with a longer-than-3 initial subtag. 166 * 167 * We also do not fill in the locCache because it is rarely used, 168 * and not worth setting unless we reuse it for many case mapping operations. 169 * (That's why UCaseMap was created.) 170 */ 171 int i; 172 char c; 173 174 /* the internal functions require locale!=NULL */ 175 if(locale==NULL) { 176 locale=uloc_getDefault(); 177 } 178 for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) { 179 csm->locale[i]=c; 180 } 181 if(i<=3) { 182 csm->locale[i]=0; /* Up to 3 non-separator characters. */ 183 } else { 184 csm->locale[0]=0; /* Longer-than-3 initial subtag: Ignore. */ 185 } 186 } 187 188 /* 189 * Set parameters on an empty UCaseMap, for UCaseMap-less API functions. 190 * Do this fast because it is called with every function call. 191 */ 192 static U_INLINE void 193 setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 194 if(csm->csp==NULL) { 195 csm->csp=ucase_getSingleton(pErrorCode); 196 if(U_FAILURE(*pErrorCode)) { 197 return; 198 } 199 } 200 if(locale!=NULL && locale[0]==0) { 201 csm->locale[0]=0; 202 } else { 203 setTempCaseMapLocale(csm, locale, pErrorCode); 204 } 205 } 206 207 #if !UCONFIG_NO_BREAK_ITERATION 208 209 /* 210 * Internal titlecasing function. 211 */ 212 static int32_t 213 _toTitle(UCaseMap *csm, 214 UChar *dest, int32_t destCapacity, 215 const UChar *src, UCaseContext *csc, 216 int32_t srcLength, 217 UErrorCode *pErrorCode) { 218 const UChar *s; 219 UChar32 c; 220 int32_t prev, titleStart, titleLimit, idx, destIndex, length; 221 UBool isFirstIndex; 222 223 if(csm->iter!=NULL) { 224 ubrk_setText(csm->iter, src, srcLength, pErrorCode); 225 } else { 226 csm->iter=ubrk_open(UBRK_WORD, csm->locale, 227 src, srcLength, 228 pErrorCode); 229 } 230 if(U_FAILURE(*pErrorCode)) { 231 return 0; 232 } 233 234 /* set up local variables */ 235 destIndex=0; 236 prev=0; 237 isFirstIndex=TRUE; 238 239 /* titlecasing loop */ 240 while(prev<srcLength) { 241 /* find next index where to titlecase */ 242 if(isFirstIndex) { 243 isFirstIndex=FALSE; 244 idx=ubrk_first(csm->iter); 245 } else { 246 idx=ubrk_next(csm->iter); 247 } 248 if(idx==UBRK_DONE || idx>srcLength) { 249 idx=srcLength; 250 } 251 252 /* 253 * Unicode 4 & 5 section 3.13 Default Case Operations: 254 * 255 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 256 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 257 * cased character F. If F exists, map F to default_title(F); then map each 258 * subsequent character C to default_lower(C). 259 * 260 * In this implementation, segment [prev..index[ into 3 parts: 261 * a) uncased characters (copy as-is) [prev..titleStart[ 262 * b) first case letter (titlecase) [titleStart..titleLimit[ 263 * c) subsequent characters (lowercase) [titleLimit..index[ 264 */ 265 if(prev<idx) { 266 /* find and copy uncased characters [prev..titleStart[ */ 267 titleStart=titleLimit=prev; 268 U16_NEXT(src, titleLimit, idx, c); 269 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { 270 /* Adjust the titlecasing index (titleStart) to the next cased character. */ 271 for(;;) { 272 titleStart=titleLimit; 273 if(titleLimit==idx) { 274 /* 275 * only uncased characters in [prev..index[ 276 * stop with titleStart==titleLimit==index 277 */ 278 break; 279 } 280 U16_NEXT(src, titleLimit, idx, c); 281 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { 282 break; /* cased letter at [titleStart..titleLimit[ */ 283 } 284 } 285 length=titleStart-prev; 286 if(length>0) { 287 if((destIndex+length)<=destCapacity) { 288 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); 289 } 290 destIndex+=length; 291 } 292 } 293 294 if(titleStart<titleLimit) { 295 /* titlecase c which is from [titleStart..titleLimit[ */ 296 csc->cpStart=titleStart; 297 csc->cpLimit=titleLimit; 298 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache); 299 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 300 301 /* Special case Dutch IJ titlecasing */ 302 if ( titleStart+1 < idx && 303 ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && 304 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && 305 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 306 c=(UChar32) 0x004A; 307 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 308 titleLimit++; 309 } 310 311 /* lowercase [titleLimit..index[ */ 312 if(titleLimit<idx) { 313 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { 314 /* Normal operation: Lowercase the rest of the word. */ 315 destIndex+= 316 _caseMap( 317 csm, ucase_toFullLower, 318 dest+destIndex, destCapacity-destIndex, 319 src, csc, 320 titleLimit, idx, 321 pErrorCode); 322 } else { 323 /* Optionally just copy the rest of the word unchanged. */ 324 length=idx-titleLimit; 325 if((destIndex+length)<=destCapacity) { 326 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); 327 } 328 destIndex+=length; 329 } 330 } 331 } 332 } 333 334 prev=idx; 335 } 336 337 if(destIndex>destCapacity) { 338 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 339 } 340 return destIndex; 341 } 342 343 #endif 344 345 /* functions available in the common library (for unistr_case.cpp) */ 346 347 U_CFUNC int32_t 348 ustr_toLower(const UCaseProps *csp, 349 UChar *dest, int32_t destCapacity, 350 const UChar *src, int32_t srcLength, 351 const char *locale, 352 UErrorCode *pErrorCode) { 353 UCaseMap csm={ NULL }; 354 UCaseContext csc={ NULL }; 355 356 csm.csp=csp; 357 setTempCaseMap(&csm, locale, pErrorCode); 358 csc.p=(void *)src; 359 csc.limit=srcLength; 360 361 return _caseMap(&csm, ucase_toFullLower, 362 dest, destCapacity, 363 src, &csc, 0, srcLength, 364 pErrorCode); 365 } 366 367 U_CFUNC int32_t 368 ustr_toUpper(const UCaseProps *csp, 369 UChar *dest, int32_t destCapacity, 370 const UChar *src, int32_t srcLength, 371 const char *locale, 372 UErrorCode *pErrorCode) { 373 UCaseMap csm={ NULL }; 374 UCaseContext csc={ NULL }; 375 376 csm.csp=csp; 377 setTempCaseMap(&csm, locale, pErrorCode); 378 csc.p=(void *)src; 379 csc.limit=srcLength; 380 381 return _caseMap(&csm, ucase_toFullUpper, 382 dest, destCapacity, 383 src, &csc, 0, srcLength, 384 pErrorCode); 385 } 386 387 #if !UCONFIG_NO_BREAK_ITERATION 388 389 U_CFUNC int32_t 390 ustr_toTitle(const UCaseProps *csp, 391 UChar *dest, int32_t destCapacity, 392 const UChar *src, int32_t srcLength, 393 UBreakIterator *titleIter, 394 const char *locale, uint32_t options, 395 UErrorCode *pErrorCode) { 396 UCaseMap csm={ NULL }; 397 UCaseContext csc={ NULL }; 398 int32_t length; 399 400 csm.csp=csp; 401 csm.iter=titleIter; 402 csm.options=options; 403 setTempCaseMap(&csm, locale, pErrorCode); 404 csc.p=(void *)src; 405 csc.limit=srcLength; 406 407 length=_toTitle(&csm, 408 dest, destCapacity, 409 src, &csc, srcLength, 410 pErrorCode); 411 if(titleIter==NULL && csm.iter!=NULL) { 412 ubrk_close(csm.iter); 413 } 414 return length; 415 } 416 417 #endif 418 419 U_CFUNC int32_t 420 ustr_foldCase(const UCaseProps *csp, 421 UChar *dest, int32_t destCapacity, 422 const UChar *src, int32_t srcLength, 423 uint32_t options, 424 UErrorCode *pErrorCode) { 425 int32_t srcIndex, destIndex; 426 427 const UChar *s; 428 UChar32 c, c2 = 0; 429 430 /* case mapping loop */ 431 srcIndex=destIndex=0; 432 while(srcIndex<srcLength) { 433 U16_NEXT(src, srcIndex, srcLength, c); 434 c=ucase_toFullFolding(csp, c, &s, options); 435 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 436 /* fast path version of appendResult() for BMP results */ 437 dest[destIndex++]=(UChar)c2; 438 } else { 439 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 440 } 441 } 442 443 if(destIndex>destCapacity) { 444 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 445 } 446 return destIndex; 447 } 448 449 /* 450 * Implement argument checking and buffer handling 451 * for string case mapping as a common function. 452 */ 453 454 /* common internal function for public API functions */ 455 456 static int32_t 457 caseMap(const UCaseMap *csm, 458 UChar *dest, int32_t destCapacity, 459 const UChar *src, int32_t srcLength, 460 int32_t toWhichCase, 461 UErrorCode *pErrorCode) { 462 UChar buffer[300]; 463 UChar *temp; 464 465 int32_t destLength; 466 467 /* check argument values */ 468 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 469 return 0; 470 } 471 if( destCapacity<0 || 472 (dest==NULL && destCapacity>0) || 473 src==NULL || 474 srcLength<-1 475 ) { 476 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 477 return 0; 478 } 479 480 /* get the string length */ 481 if(srcLength==-1) { 482 srcLength=u_strlen(src); 483 } 484 485 /* check for overlapping source and destination */ 486 if( dest!=NULL && 487 ((src>=dest && src<(dest+destCapacity)) || 488 (dest>=src && dest<(src+srcLength))) 489 ) { 490 /* overlap: provide a temporary destination buffer and later copy the result */ 491 if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) { 492 /* the stack buffer is large enough */ 493 temp=buffer; 494 } else { 495 /* allocate a buffer */ 496 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); 497 if(temp==NULL) { 498 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 499 return 0; 500 } 501 } 502 } else { 503 temp=dest; 504 } 505 506 destLength=0; 507 508 if(toWhichCase==FOLD_CASE) { 509 destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength, 510 csm->options, pErrorCode); 511 } else { 512 UCaseContext csc={ NULL }; 513 514 csc.p=(void *)src; 515 csc.limit=srcLength; 516 517 if(toWhichCase==TO_LOWER) { 518 destLength=_caseMap(csm, ucase_toFullLower, 519 temp, destCapacity, 520 src, &csc, 521 0, srcLength, 522 pErrorCode); 523 } else if(toWhichCase==TO_UPPER) { 524 destLength=_caseMap(csm, ucase_toFullUpper, 525 temp, destCapacity, 526 src, &csc, 527 0, srcLength, 528 pErrorCode); 529 } else /* if(toWhichCase==TO_TITLE) */ { 530 #if UCONFIG_NO_BREAK_ITERATION 531 *pErrorCode=U_UNSUPPORTED_ERROR; 532 #else 533 /* UCaseMap is actually non-const in toTitle() APIs. */ 534 destLength=_toTitle((UCaseMap *)csm, temp, destCapacity, 535 src, &csc, srcLength, 536 pErrorCode); 537 #endif 538 } 539 } 540 if(temp!=dest) { 541 /* copy the result string to the destination buffer */ 542 if(destLength>0) { 543 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; 544 if(copyLength>0) { 545 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); 546 } 547 } 548 if(temp!=buffer) { 549 uprv_free(temp); 550 } 551 } 552 553 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 554 } 555 556 /* public API functions */ 557 558 U_CAPI int32_t U_EXPORT2 559 u_strToLower(UChar *dest, int32_t destCapacity, 560 const UChar *src, int32_t srcLength, 561 const char *locale, 562 UErrorCode *pErrorCode) { 563 UCaseMap csm={ NULL }; 564 setTempCaseMap(&csm, locale, pErrorCode); 565 return caseMap(&csm, 566 dest, destCapacity, 567 src, srcLength, 568 TO_LOWER, pErrorCode); 569 } 570 571 U_CAPI int32_t U_EXPORT2 572 u_strToUpper(UChar *dest, int32_t destCapacity, 573 const UChar *src, int32_t srcLength, 574 const char *locale, 575 UErrorCode *pErrorCode) { 576 UCaseMap csm={ NULL }; 577 setTempCaseMap(&csm, locale, pErrorCode); 578 return caseMap(&csm, 579 dest, destCapacity, 580 src, srcLength, 581 TO_UPPER, pErrorCode); 582 } 583 584 #if !UCONFIG_NO_BREAK_ITERATION 585 586 U_CAPI int32_t U_EXPORT2 587 u_strToTitle(UChar *dest, int32_t destCapacity, 588 const UChar *src, int32_t srcLength, 589 UBreakIterator *titleIter, 590 const char *locale, 591 UErrorCode *pErrorCode) { 592 UCaseMap csm={ NULL }; 593 int32_t length; 594 595 csm.iter=titleIter; 596 setTempCaseMap(&csm, locale, pErrorCode); 597 length=caseMap(&csm, 598 dest, destCapacity, 599 src, srcLength, 600 TO_TITLE, pErrorCode); 601 if(titleIter==NULL && csm.iter!=NULL) { 602 ubrk_close(csm.iter); 603 } 604 return length; 605 } 606 607 U_CAPI int32_t U_EXPORT2 608 ucasemap_toTitle(UCaseMap *csm, 609 UChar *dest, int32_t destCapacity, 610 const UChar *src, int32_t srcLength, 611 UErrorCode *pErrorCode) { 612 return caseMap(csm, 613 dest, destCapacity, 614 src, srcLength, 615 TO_TITLE, pErrorCode); 616 } 617 618 #endif 619 620 U_CAPI int32_t U_EXPORT2 621 u_strFoldCase(UChar *dest, int32_t destCapacity, 622 const UChar *src, int32_t srcLength, 623 uint32_t options, 624 UErrorCode *pErrorCode) { 625 UCaseMap csm={ NULL }; 626 csm.csp=ucase_getSingleton(pErrorCode); 627 csm.options=options; 628 return caseMap(&csm, 629 dest, destCapacity, 630 src, srcLength, 631 FOLD_CASE, pErrorCode); 632 } 633 634 /* case-insensitive string comparisons -------------------------------------- */ 635 636 /* 637 * This function is a copy of unorm_cmpEquivFold() minus the parts for 638 * canonical equivalence. 639 * Keep the functions in sync, and see there for how this works. 640 * The duplication is for modularization: 641 * It makes caseless (but not canonical caseless) matches independent of 642 * the normalization code. 643 */ 644 645 /* stack element for previous-level source/decomposition pointers */ 646 struct CmpEquivLevel { 647 const UChar *start, *s, *limit; 648 }; 649 typedef struct CmpEquivLevel CmpEquivLevel; 650 651 /* internal function */ 652 U_CFUNC int32_t 653 u_strcmpFold(const UChar *s1, int32_t length1, 654 const UChar *s2, int32_t length2, 655 uint32_t options, 656 UErrorCode *pErrorCode) { 657 const UCaseProps *csp; 658 659 /* current-level start/limit - s1/s2 as current */ 660 const UChar *start1, *start2, *limit1, *limit2; 661 662 /* case folding variables */ 663 const UChar *p; 664 int32_t length; 665 666 /* stacks of previous-level start/current/limit */ 667 CmpEquivLevel stack1[2], stack2[2]; 668 669 /* case folding buffers, only use current-level start/limit */ 670 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 671 672 /* track which is the current level per string */ 673 int32_t level1, level2; 674 675 /* current code units, and code points for lookups */ 676 UChar32 c1, c2, cp1, cp2; 677 678 /* no argument error checking because this itself is not an API */ 679 680 /* 681 * assume that at least the option U_COMPARE_IGNORE_CASE is set 682 * otherwise this function would have to behave exactly as uprv_strCompare() 683 */ 684 csp=ucase_getSingleton(pErrorCode); 685 if(U_FAILURE(*pErrorCode)) { 686 return 0; 687 } 688 689 /* initialize */ 690 start1=s1; 691 if(length1==-1) { 692 limit1=NULL; 693 } else { 694 limit1=s1+length1; 695 } 696 697 start2=s2; 698 if(length2==-1) { 699 limit2=NULL; 700 } else { 701 limit2=s2+length2; 702 } 703 704 level1=level2=0; 705 c1=c2=-1; 706 707 /* comparison loop */ 708 for(;;) { 709 /* 710 * here a code unit value of -1 means "get another code unit" 711 * below it will mean "this source is finished" 712 */ 713 714 if(c1<0) { 715 /* get next code unit from string 1, post-increment */ 716 for(;;) { 717 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 718 if(level1==0) { 719 c1=-1; 720 break; 721 } 722 } else { 723 ++s1; 724 break; 725 } 726 727 /* reached end of level buffer, pop one level */ 728 do { 729 --level1; 730 start1=stack1[level1].start; 731 } while(start1==NULL); 732 s1=stack1[level1].s; 733 limit1=stack1[level1].limit; 734 } 735 } 736 737 if(c2<0) { 738 /* get next code unit from string 2, post-increment */ 739 for(;;) { 740 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 741 if(level2==0) { 742 c2=-1; 743 break; 744 } 745 } else { 746 ++s2; 747 break; 748 } 749 750 /* reached end of level buffer, pop one level */ 751 do { 752 --level2; 753 start2=stack2[level2].start; 754 } while(start2==NULL); 755 s2=stack2[level2].s; 756 limit2=stack2[level2].limit; 757 } 758 } 759 760 /* 761 * compare c1 and c2 762 * either variable c1, c2 is -1 only if the corresponding string is finished 763 */ 764 if(c1==c2) { 765 if(c1<0) { 766 return 0; /* c1==c2==-1 indicating end of strings */ 767 } 768 c1=c2=-1; /* make us fetch new code units */ 769 continue; 770 } else if(c1<0) { 771 return -1; /* string 1 ends before string 2 */ 772 } else if(c2<0) { 773 return 1; /* string 2 ends before string 1 */ 774 } 775 /* c1!=c2 && c1>=0 && c2>=0 */ 776 777 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 778 cp1=c1; 779 if(U_IS_SURROGATE(c1)) { 780 UChar c; 781 782 if(U_IS_SURROGATE_LEAD(c1)) { 783 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 784 /* advance ++s1; only below if cp1 decomposes/case-folds */ 785 cp1=U16_GET_SUPPLEMENTARY(c1, c); 786 } 787 } else /* isTrail(c1) */ { 788 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 789 cp1=U16_GET_SUPPLEMENTARY(c, c1); 790 } 791 } 792 } 793 794 cp2=c2; 795 if(U_IS_SURROGATE(c2)) { 796 UChar c; 797 798 if(U_IS_SURROGATE_LEAD(c2)) { 799 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 800 /* advance ++s2; only below if cp2 decomposes/case-folds */ 801 cp2=U16_GET_SUPPLEMENTARY(c2, c); 802 } 803 } else /* isTrail(c2) */ { 804 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 805 cp2=U16_GET_SUPPLEMENTARY(c, c2); 806 } 807 } 808 } 809 810 /* 811 * go down one level for each string 812 * continue with the main loop as soon as there is a real change 813 */ 814 815 if( level1==0 && 816 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 817 ) { 818 /* cp1 case-folds to the code point "length" or to p[length] */ 819 if(U_IS_SURROGATE(c1)) { 820 if(U_IS_SURROGATE_LEAD(c1)) { 821 /* advance beyond source surrogate pair if it case-folds */ 822 ++s1; 823 } else /* isTrail(c1) */ { 824 /* 825 * we got a supplementary code point when hitting its trail surrogate, 826 * therefore the lead surrogate must have been the same as in the other string; 827 * compare this decomposition with the lead surrogate in the other string 828 * remember that this simulates bulk text replacement: 829 * the decomposition would replace the entire code point 830 */ 831 --s2; 832 c2=*(s2-1); 833 } 834 } 835 836 /* push current level pointers */ 837 stack1[0].start=start1; 838 stack1[0].s=s1; 839 stack1[0].limit=limit1; 840 ++level1; 841 842 /* copy the folding result to fold1[] */ 843 if(length<=UCASE_MAX_STRING_LENGTH) { 844 u_memcpy(fold1, p, length); 845 } else { 846 int32_t i=0; 847 U16_APPEND_UNSAFE(fold1, i, length); 848 length=i; 849 } 850 851 /* set next level pointers to case folding */ 852 start1=s1=fold1; 853 limit1=fold1+length; 854 855 /* get ready to read from decomposition, continue with loop */ 856 c1=-1; 857 continue; 858 } 859 860 if( level2==0 && 861 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 862 ) { 863 /* cp2 case-folds to the code point "length" or to p[length] */ 864 if(U_IS_SURROGATE(c2)) { 865 if(U_IS_SURROGATE_LEAD(c2)) { 866 /* advance beyond source surrogate pair if it case-folds */ 867 ++s2; 868 } else /* isTrail(c2) */ { 869 /* 870 * we got a supplementary code point when hitting its trail surrogate, 871 * therefore the lead surrogate must have been the same as in the other string; 872 * compare this decomposition with the lead surrogate in the other string 873 * remember that this simulates bulk text replacement: 874 * the decomposition would replace the entire code point 875 */ 876 --s1; 877 c1=*(s1-1); 878 } 879 } 880 881 /* push current level pointers */ 882 stack2[0].start=start2; 883 stack2[0].s=s2; 884 stack2[0].limit=limit2; 885 ++level2; 886 887 /* copy the folding result to fold2[] */ 888 if(length<=UCASE_MAX_STRING_LENGTH) { 889 u_memcpy(fold2, p, length); 890 } else { 891 int32_t i=0; 892 U16_APPEND_UNSAFE(fold2, i, length); 893 length=i; 894 } 895 896 /* set next level pointers to case folding */ 897 start2=s2=fold2; 898 limit2=fold2+length; 899 900 /* get ready to read from decomposition, continue with loop */ 901 c2=-1; 902 continue; 903 } 904 905 /* 906 * no decomposition/case folding, max level for both sides: 907 * return difference result 908 * 909 * code point order comparison must not just return cp1-cp2 910 * because when single surrogates are present then the surrogate pairs 911 * that formed cp1 and cp2 may be from different string indexes 912 * 913 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units 914 * c1=d800 cp1=10001 c2=dc00 cp2=10000 915 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 916 * 917 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 918 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 919 * so we have slightly different pointer/start/limit comparisons here 920 */ 921 922 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 923 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 924 if( 925 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 926 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 927 ) { 928 /* part of a surrogate pair, leave >=d800 */ 929 } else { 930 /* BMP code point - may be surrogate code point - make <d800 */ 931 c1-=0x2800; 932 } 933 934 if( 935 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 936 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 937 ) { 938 /* part of a surrogate pair, leave >=d800 */ 939 } else { 940 /* BMP code point - may be surrogate code point - make <d800 */ 941 c2-=0x2800; 942 } 943 } 944 945 return c1-c2; 946 } 947 } 948 949 /* public API functions */ 950 951 U_CAPI int32_t U_EXPORT2 952 u_strCaseCompare(const UChar *s1, int32_t length1, 953 const UChar *s2, int32_t length2, 954 uint32_t options, 955 UErrorCode *pErrorCode) { 956 /* argument checking */ 957 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { 958 return 0; 959 } 960 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 961 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 962 return 0; 963 } 964 return u_strcmpFold(s1, length1, s2, length2, 965 options|U_COMPARE_IGNORE_CASE, 966 pErrorCode); 967 } 968 969 U_CAPI int32_t U_EXPORT2 970 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { 971 UErrorCode errorCode=U_ZERO_ERROR; 972 return u_strcmpFold(s1, -1, s2, -1, 973 options|U_COMPARE_IGNORE_CASE, 974 &errorCode); 975 } 976 977 U_CAPI int32_t U_EXPORT2 978 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { 979 UErrorCode errorCode=U_ZERO_ERROR; 980 return u_strcmpFold(s1, length, s2, length, 981 options|U_COMPARE_IGNORE_CASE, 982 &errorCode); 983 } 984 985 U_CAPI int32_t U_EXPORT2 986 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { 987 UErrorCode errorCode=U_ZERO_ERROR; 988 return u_strcmpFold(s1, n, s2, n, 989 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), 990 &errorCode); 991 } 992