1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ustrcase.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb20 14 * created by: Markus W. Scherer 15 * 16 * Implementation file for string casing C API functions. 17 * Uses functions from uchar.c for basic functionality that requires access 18 * to the Unicode Character Database (uprops.dat). 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/uloc.h" 23 #include "unicode/ustring.h" 24 #include "unicode/ucasemap.h" 25 #include "unicode/ubrk.h" 26 #include "cmemory.h" 27 #include "ucase.h" 28 #include "ustr_imp.h" 29 30 /* string casing ------------------------------------------------------------ */ 31 32 /* append a full case mapping result, see UCASE_MAX_STRING_LENGTH */ 33 static U_INLINE int32_t 34 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, 35 int32_t result, const UChar *s) { 36 UChar32 c; 37 int32_t length; 38 39 /* decode the result */ 40 if(result<0) { 41 /* (not) original code point */ 42 c=~result; 43 length=-1; 44 } else if(result<=UCASE_MAX_STRING_LENGTH) { 45 c=U_SENTINEL; 46 length=result; 47 } else { 48 c=result; 49 length=-1; 50 } 51 52 if(destIndex<destCapacity) { 53 /* append the result */ 54 if(length<0) { 55 /* code point */ 56 UBool isError=FALSE; 57 U16_APPEND(dest, destIndex, destCapacity, c, isError); 58 if(isError) { 59 /* overflow, nothing written */ 60 destIndex+=U16_LENGTH(c); 61 } 62 } else { 63 /* string */ 64 if((destIndex+length)<=destCapacity) { 65 while(length>0) { 66 dest[destIndex++]=*s++; 67 --length; 68 } 69 } else { 70 /* overflow */ 71 destIndex+=length; 72 } 73 } 74 } else { 75 /* preflight */ 76 if(length<0) { 77 destIndex+=U16_LENGTH(c); 78 } else { 79 destIndex+=length; 80 } 81 } 82 return destIndex; 83 } 84 85 static UChar32 U_CALLCONV 86 utf16_caseContextIterator(void *context, int8_t dir) { 87 UCaseContext *csc=(UCaseContext *)context; 88 UChar32 c; 89 90 if(dir<0) { 91 /* reset for backward iteration */ 92 csc->index=csc->cpStart; 93 csc->dir=dir; 94 } else if(dir>0) { 95 /* reset for forward iteration */ 96 csc->index=csc->cpLimit; 97 csc->dir=dir; 98 } else { 99 /* continue current iteration direction */ 100 dir=csc->dir; 101 } 102 103 if(dir<0) { 104 if(csc->start<csc->index) { 105 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); 106 return c; 107 } 108 } else { 109 if(csc->index<csc->limit) { 110 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); 111 return c; 112 } 113 } 114 return U_SENTINEL; 115 } 116 117 /* 118 * Case-maps [srcStart..srcLimit[ but takes 119 * context [0..srcLength[ into account. 120 */ 121 static int32_t 122 _caseMap(const UCaseMap *csm, UCaseMapFull *map, 123 UChar *dest, int32_t destCapacity, 124 const UChar *src, UCaseContext *csc, 125 int32_t srcStart, int32_t srcLimit, 126 UErrorCode *pErrorCode) { 127 const UChar *s; 128 UChar32 c, c2 = 0; 129 int32_t srcIndex, destIndex; 130 int32_t locCache; 131 132 locCache=csm->locCache; 133 134 /* case mapping loop */ 135 srcIndex=srcStart; 136 destIndex=0; 137 while(srcIndex<srcLimit) { 138 csc->cpStart=srcIndex; 139 U16_NEXT(src, srcIndex, srcLimit, c); 140 csc->cpLimit=srcIndex; 141 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); 142 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 143 /* fast path version of appendResult() for BMP results */ 144 dest[destIndex++]=(UChar)c2; 145 } else { 146 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 147 } 148 } 149 150 if(destIndex>destCapacity) { 151 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 152 } 153 return destIndex; 154 } 155 156 static void 157 setTempCaseMapLocale(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 158 /* 159 * We could call ucasemap_setLocale(), but here we really only care about 160 * the initial language subtag, we need not return the real string via 161 * ucasemap_getLocale(), and we don't care about only getting "x" from 162 * "x-some-thing" etc. 163 * 164 * We ignore locales with a longer-than-3 initial subtag. 165 * 166 * We also do not fill in the locCache because it is rarely used, 167 * and not worth setting unless we reuse it for many case mapping operations. 168 * (That's why UCaseMap was created.) 169 */ 170 int i; 171 char c; 172 173 /* the internal functions require locale!=NULL */ 174 if(locale==NULL) { 175 locale=uloc_getDefault(); 176 } 177 for(i=0; i<4 && (c=locale[i])!=0 && c!='-' && c!='_'; ++i) { 178 csm->locale[i]=c; 179 } 180 if(i<=3) { 181 csm->locale[i]=0; /* Up to 3 non-separator characters. */ 182 } else { 183 csm->locale[0]=0; /* Longer-than-3 initial subtag: Ignore. */ 184 } 185 } 186 187 /* 188 * Set parameters on an empty UCaseMap, for UCaseMap-less API functions. 189 * Do this fast because it is called with every function call. 190 */ 191 static U_INLINE void 192 setTempCaseMap(UCaseMap *csm, const char *locale, UErrorCode *pErrorCode) { 193 if(csm->csp==NULL) { 194 csm->csp=ucase_getSingleton(); 195 } 196 if(locale!=NULL && locale[0]==0) { 197 csm->locale[0]=0; 198 } else { 199 setTempCaseMapLocale(csm, locale, pErrorCode); 200 } 201 } 202 203 #if !UCONFIG_NO_BREAK_ITERATION 204 205 /* 206 * Internal titlecasing function. 207 */ 208 static int32_t 209 _toTitle(UCaseMap *csm, 210 UChar *dest, int32_t destCapacity, 211 const UChar *src, UCaseContext *csc, 212 int32_t srcLength, 213 UErrorCode *pErrorCode) { 214 const UChar *s; 215 UChar32 c; 216 int32_t prev, titleStart, titleLimit, idx, destIndex, length; 217 UBool isFirstIndex; 218 219 if(csm->iter!=NULL) { 220 ubrk_setText(csm->iter, src, srcLength, pErrorCode); 221 } else { 222 csm->iter=ubrk_open(UBRK_WORD, csm->locale, 223 src, srcLength, 224 pErrorCode); 225 } 226 if(U_FAILURE(*pErrorCode)) { 227 return 0; 228 } 229 230 /* set up local variables */ 231 destIndex=0; 232 prev=0; 233 isFirstIndex=TRUE; 234 235 /* titlecasing loop */ 236 while(prev<srcLength) { 237 /* find next index where to titlecase */ 238 if(isFirstIndex) { 239 isFirstIndex=FALSE; 240 idx=ubrk_first(csm->iter); 241 } else { 242 idx=ubrk_next(csm->iter); 243 } 244 if(idx==UBRK_DONE || idx>srcLength) { 245 idx=srcLength; 246 } 247 248 /* 249 * Unicode 4 & 5 section 3.13 Default Case Operations: 250 * 251 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 252 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 253 * cased character F. If F exists, map F to default_title(F); then map each 254 * subsequent character C to default_lower(C). 255 * 256 * In this implementation, segment [prev..index[ into 3 parts: 257 * a) uncased characters (copy as-is) [prev..titleStart[ 258 * b) first case letter (titlecase) [titleStart..titleLimit[ 259 * c) subsequent characters (lowercase) [titleLimit..index[ 260 */ 261 if(prev<idx) { 262 /* find and copy uncased characters [prev..titleStart[ */ 263 titleStart=titleLimit=prev; 264 U16_NEXT(src, titleLimit, idx, c); 265 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { 266 /* Adjust the titlecasing index (titleStart) to the next cased character. */ 267 for(;;) { 268 titleStart=titleLimit; 269 if(titleLimit==idx) { 270 /* 271 * only uncased characters in [prev..index[ 272 * stop with titleStart==titleLimit==index 273 */ 274 break; 275 } 276 U16_NEXT(src, titleLimit, idx, c); 277 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { 278 break; /* cased letter at [titleStart..titleLimit[ */ 279 } 280 } 281 length=titleStart-prev; 282 if(length>0) { 283 if((destIndex+length)<=destCapacity) { 284 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); 285 } 286 destIndex+=length; 287 } 288 } 289 290 if(titleStart<titleLimit) { 291 /* titlecase c which is from [titleStart..titleLimit[ */ 292 csc->cpStart=titleStart; 293 csc->cpLimit=titleLimit; 294 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache); 295 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 296 297 /* Special case Dutch IJ titlecasing */ 298 if ( titleStart+1 < idx && 299 ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH && 300 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && 301 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 302 c=(UChar32) 0x004A; 303 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 304 titleLimit++; 305 } 306 307 /* lowercase [titleLimit..index[ */ 308 if(titleLimit<idx) { 309 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { 310 /* Normal operation: Lowercase the rest of the word. */ 311 destIndex+= 312 _caseMap( 313 csm, ucase_toFullLower, 314 dest+destIndex, destCapacity-destIndex, 315 src, csc, 316 titleLimit, idx, 317 pErrorCode); 318 } else { 319 /* Optionally just copy the rest of the word unchanged. */ 320 length=idx-titleLimit; 321 if((destIndex+length)<=destCapacity) { 322 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); 323 } 324 destIndex+=length; 325 } 326 } 327 } 328 } 329 330 prev=idx; 331 } 332 333 if(destIndex>destCapacity) { 334 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 335 } 336 return destIndex; 337 } 338 339 #endif 340 341 /* functions available in the common library (for unistr_case.cpp) */ 342 343 U_CFUNC int32_t 344 ustr_toLower(const UCaseProps *csp, 345 UChar *dest, int32_t destCapacity, 346 const UChar *src, int32_t srcLength, 347 const char *locale, 348 UErrorCode *pErrorCode) { 349 UCaseMap csm={ NULL }; 350 UCaseContext csc={ NULL }; 351 352 csm.csp=csp; 353 setTempCaseMap(&csm, locale, pErrorCode); 354 csc.p=(void *)src; 355 csc.limit=srcLength; 356 357 return _caseMap(&csm, ucase_toFullLower, 358 dest, destCapacity, 359 src, &csc, 0, srcLength, 360 pErrorCode); 361 } 362 363 U_CFUNC int32_t 364 ustr_toUpper(const UCaseProps *csp, 365 UChar *dest, int32_t destCapacity, 366 const UChar *src, int32_t srcLength, 367 const char *locale, 368 UErrorCode *pErrorCode) { 369 UCaseMap csm={ NULL }; 370 UCaseContext csc={ NULL }; 371 372 csm.csp=csp; 373 setTempCaseMap(&csm, locale, pErrorCode); 374 csc.p=(void *)src; 375 csc.limit=srcLength; 376 377 return _caseMap(&csm, ucase_toFullUpper, 378 dest, destCapacity, 379 src, &csc, 0, srcLength, 380 pErrorCode); 381 } 382 383 #if !UCONFIG_NO_BREAK_ITERATION 384 385 U_CFUNC int32_t 386 ustr_toTitle(const UCaseProps *csp, 387 UChar *dest, int32_t destCapacity, 388 const UChar *src, int32_t srcLength, 389 UBreakIterator *titleIter, 390 const char *locale, uint32_t options, 391 UErrorCode *pErrorCode) { 392 UCaseMap csm={ NULL }; 393 UCaseContext csc={ NULL }; 394 int32_t length; 395 396 csm.csp=csp; 397 csm.iter=titleIter; 398 csm.options=options; 399 setTempCaseMap(&csm, locale, pErrorCode); 400 csc.p=(void *)src; 401 csc.limit=srcLength; 402 403 length=_toTitle(&csm, 404 dest, destCapacity, 405 src, &csc, srcLength, 406 pErrorCode); 407 if(titleIter==NULL && csm.iter!=NULL) { 408 ubrk_close(csm.iter); 409 } 410 return length; 411 } 412 413 #endif 414 415 U_CFUNC int32_t 416 ustr_foldCase(const UCaseProps *csp, 417 UChar *dest, int32_t destCapacity, 418 const UChar *src, int32_t srcLength, 419 uint32_t options, 420 UErrorCode *pErrorCode) { 421 int32_t srcIndex, destIndex; 422 423 const UChar *s; 424 UChar32 c, c2 = 0; 425 426 /* case mapping loop */ 427 srcIndex=destIndex=0; 428 while(srcIndex<srcLength) { 429 U16_NEXT(src, srcIndex, srcLength, c); 430 c=ucase_toFullFolding(csp, c, &s, options); 431 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 432 /* fast path version of appendResult() for BMP results */ 433 dest[destIndex++]=(UChar)c2; 434 } else { 435 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 436 } 437 } 438 439 if(destIndex>destCapacity) { 440 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 441 } 442 return destIndex; 443 } 444 445 /* 446 * Implement argument checking and buffer handling 447 * for string case mapping as a common function. 448 */ 449 450 /* common internal function for public API functions */ 451 452 static int32_t 453 caseMap(const UCaseMap *csm, 454 UChar *dest, int32_t destCapacity, 455 const UChar *src, int32_t srcLength, 456 int32_t toWhichCase, 457 UErrorCode *pErrorCode) { 458 UChar buffer[300]; 459 UChar *temp; 460 461 int32_t destLength; 462 463 /* check argument values */ 464 if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { 465 return 0; 466 } 467 if( destCapacity<0 || 468 (dest==NULL && destCapacity>0) || 469 src==NULL || 470 srcLength<-1 471 ) { 472 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 473 return 0; 474 } 475 476 /* get the string length */ 477 if(srcLength==-1) { 478 srcLength=u_strlen(src); 479 } 480 481 /* check for overlapping source and destination */ 482 if( dest!=NULL && 483 ((src>=dest && src<(dest+destCapacity)) || 484 (dest>=src && dest<(src+srcLength))) 485 ) { 486 /* overlap: provide a temporary destination buffer and later copy the result */ 487 if(destCapacity<=(sizeof(buffer)/U_SIZEOF_UCHAR)) { 488 /* the stack buffer is large enough */ 489 temp=buffer; 490 } else { 491 /* allocate a buffer */ 492 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); 493 if(temp==NULL) { 494 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 495 return 0; 496 } 497 } 498 } else { 499 temp=dest; 500 } 501 502 destLength=0; 503 504 if(toWhichCase==FOLD_CASE) { 505 destLength=ustr_foldCase(csm->csp, temp, destCapacity, src, srcLength, 506 csm->options, pErrorCode); 507 } else { 508 UCaseContext csc={ NULL }; 509 510 csc.p=(void *)src; 511 csc.limit=srcLength; 512 513 if(toWhichCase==TO_LOWER) { 514 destLength=_caseMap(csm, ucase_toFullLower, 515 temp, destCapacity, 516 src, &csc, 517 0, srcLength, 518 pErrorCode); 519 } else if(toWhichCase==TO_UPPER) { 520 destLength=_caseMap(csm, ucase_toFullUpper, 521 temp, destCapacity, 522 src, &csc, 523 0, srcLength, 524 pErrorCode); 525 } else /* if(toWhichCase==TO_TITLE) */ { 526 #if UCONFIG_NO_BREAK_ITERATION 527 *pErrorCode=U_UNSUPPORTED_ERROR; 528 #else 529 /* UCaseMap is actually non-const in toTitle() APIs. */ 530 destLength=_toTitle((UCaseMap *)csm, temp, destCapacity, 531 src, &csc, srcLength, 532 pErrorCode); 533 #endif 534 } 535 } 536 if(temp!=dest) { 537 /* copy the result string to the destination buffer */ 538 if(destLength>0) { 539 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; 540 if(copyLength>0) { 541 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); 542 } 543 } 544 if(temp!=buffer) { 545 uprv_free(temp); 546 } 547 } 548 549 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 550 } 551 552 /* public API functions */ 553 554 U_CAPI int32_t U_EXPORT2 555 u_strToLower(UChar *dest, int32_t destCapacity, 556 const UChar *src, int32_t srcLength, 557 const char *locale, 558 UErrorCode *pErrorCode) { 559 UCaseMap csm={ NULL }; 560 setTempCaseMap(&csm, locale, pErrorCode); 561 return caseMap(&csm, 562 dest, destCapacity, 563 src, srcLength, 564 TO_LOWER, pErrorCode); 565 } 566 567 U_CAPI int32_t U_EXPORT2 568 u_strToUpper(UChar *dest, int32_t destCapacity, 569 const UChar *src, int32_t srcLength, 570 const char *locale, 571 UErrorCode *pErrorCode) { 572 UCaseMap csm={ NULL }; 573 setTempCaseMap(&csm, locale, pErrorCode); 574 return caseMap(&csm, 575 dest, destCapacity, 576 src, srcLength, 577 TO_UPPER, pErrorCode); 578 } 579 580 #if !UCONFIG_NO_BREAK_ITERATION 581 582 U_CAPI int32_t U_EXPORT2 583 u_strToTitle(UChar *dest, int32_t destCapacity, 584 const UChar *src, int32_t srcLength, 585 UBreakIterator *titleIter, 586 const char *locale, 587 UErrorCode *pErrorCode) { 588 UCaseMap csm={ NULL }; 589 int32_t length; 590 591 csm.iter=titleIter; 592 setTempCaseMap(&csm, locale, pErrorCode); 593 length=caseMap(&csm, 594 dest, destCapacity, 595 src, srcLength, 596 TO_TITLE, pErrorCode); 597 if(titleIter==NULL && csm.iter!=NULL) { 598 ubrk_close(csm.iter); 599 } 600 return length; 601 } 602 603 U_CAPI int32_t U_EXPORT2 604 ucasemap_toTitle(UCaseMap *csm, 605 UChar *dest, int32_t destCapacity, 606 const UChar *src, int32_t srcLength, 607 UErrorCode *pErrorCode) { 608 return caseMap(csm, 609 dest, destCapacity, 610 src, srcLength, 611 TO_TITLE, pErrorCode); 612 } 613 614 #endif 615 616 U_CAPI int32_t U_EXPORT2 617 u_strFoldCase(UChar *dest, int32_t destCapacity, 618 const UChar *src, int32_t srcLength, 619 uint32_t options, 620 UErrorCode *pErrorCode) { 621 UCaseMap csm={ NULL }; 622 csm.csp=ucase_getSingleton(); 623 csm.options=options; 624 return caseMap(&csm, 625 dest, destCapacity, 626 src, srcLength, 627 FOLD_CASE, pErrorCode); 628 } 629 630 /* case-insensitive string comparisons -------------------------------------- */ 631 632 /* 633 * This function is a copy of unorm_cmpEquivFold() minus the parts for 634 * canonical equivalence. 635 * Keep the functions in sync, and see there for how this works. 636 * The duplication is for modularization: 637 * It makes caseless (but not canonical caseless) matches independent of 638 * the normalization code. 639 */ 640 641 /* stack element for previous-level source/decomposition pointers */ 642 struct CmpEquivLevel { 643 const UChar *start, *s, *limit; 644 }; 645 typedef struct CmpEquivLevel CmpEquivLevel; 646 647 /* internal function */ 648 U_CFUNC int32_t 649 u_strcmpFold(const UChar *s1, int32_t length1, 650 const UChar *s2, int32_t length2, 651 uint32_t options, 652 UErrorCode *pErrorCode) { 653 const UCaseProps *csp; 654 655 /* current-level start/limit - s1/s2 as current */ 656 const UChar *start1, *start2, *limit1, *limit2; 657 658 /* case folding variables */ 659 const UChar *p; 660 int32_t length; 661 662 /* stacks of previous-level start/current/limit */ 663 CmpEquivLevel stack1[2], stack2[2]; 664 665 /* case folding buffers, only use current-level start/limit */ 666 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 667 668 /* track which is the current level per string */ 669 int32_t level1, level2; 670 671 /* current code units, and code points for lookups */ 672 UChar32 c1, c2, cp1, cp2; 673 674 /* no argument error checking because this itself is not an API */ 675 676 /* 677 * assume that at least the option U_COMPARE_IGNORE_CASE is set 678 * otherwise this function would have to behave exactly as uprv_strCompare() 679 */ 680 csp=ucase_getSingleton(); 681 if(U_FAILURE(*pErrorCode)) { 682 return 0; 683 } 684 685 /* initialize */ 686 start1=s1; 687 if(length1==-1) { 688 limit1=NULL; 689 } else { 690 limit1=s1+length1; 691 } 692 693 start2=s2; 694 if(length2==-1) { 695 limit2=NULL; 696 } else { 697 limit2=s2+length2; 698 } 699 700 level1=level2=0; 701 c1=c2=-1; 702 703 /* comparison loop */ 704 for(;;) { 705 /* 706 * here a code unit value of -1 means "get another code unit" 707 * below it will mean "this source is finished" 708 */ 709 710 if(c1<0) { 711 /* get next code unit from string 1, post-increment */ 712 for(;;) { 713 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 714 if(level1==0) { 715 c1=-1; 716 break; 717 } 718 } else { 719 ++s1; 720 break; 721 } 722 723 /* reached end of level buffer, pop one level */ 724 do { 725 --level1; 726 start1=stack1[level1].start; 727 } while(start1==NULL); 728 s1=stack1[level1].s; 729 limit1=stack1[level1].limit; 730 } 731 } 732 733 if(c2<0) { 734 /* get next code unit from string 2, post-increment */ 735 for(;;) { 736 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 737 if(level2==0) { 738 c2=-1; 739 break; 740 } 741 } else { 742 ++s2; 743 break; 744 } 745 746 /* reached end of level buffer, pop one level */ 747 do { 748 --level2; 749 start2=stack2[level2].start; 750 } while(start2==NULL); 751 s2=stack2[level2].s; 752 limit2=stack2[level2].limit; 753 } 754 } 755 756 /* 757 * compare c1 and c2 758 * either variable c1, c2 is -1 only if the corresponding string is finished 759 */ 760 if(c1==c2) { 761 if(c1<0) { 762 return 0; /* c1==c2==-1 indicating end of strings */ 763 } 764 c1=c2=-1; /* make us fetch new code units */ 765 continue; 766 } else if(c1<0) { 767 return -1; /* string 1 ends before string 2 */ 768 } else if(c2<0) { 769 return 1; /* string 2 ends before string 1 */ 770 } 771 /* c1!=c2 && c1>=0 && c2>=0 */ 772 773 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 774 cp1=c1; 775 if(U_IS_SURROGATE(c1)) { 776 UChar c; 777 778 if(U_IS_SURROGATE_LEAD(c1)) { 779 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 780 /* advance ++s1; only below if cp1 decomposes/case-folds */ 781 cp1=U16_GET_SUPPLEMENTARY(c1, c); 782 } 783 } else /* isTrail(c1) */ { 784 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 785 cp1=U16_GET_SUPPLEMENTARY(c, c1); 786 } 787 } 788 } 789 790 cp2=c2; 791 if(U_IS_SURROGATE(c2)) { 792 UChar c; 793 794 if(U_IS_SURROGATE_LEAD(c2)) { 795 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 796 /* advance ++s2; only below if cp2 decomposes/case-folds */ 797 cp2=U16_GET_SUPPLEMENTARY(c2, c); 798 } 799 } else /* isTrail(c2) */ { 800 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 801 cp2=U16_GET_SUPPLEMENTARY(c, c2); 802 } 803 } 804 } 805 806 /* 807 * go down one level for each string 808 * continue with the main loop as soon as there is a real change 809 */ 810 811 if( level1==0 && 812 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 813 ) { 814 /* cp1 case-folds to the code point "length" or to p[length] */ 815 if(U_IS_SURROGATE(c1)) { 816 if(U_IS_SURROGATE_LEAD(c1)) { 817 /* advance beyond source surrogate pair if it case-folds */ 818 ++s1; 819 } else /* isTrail(c1) */ { 820 /* 821 * we got a supplementary code point when hitting its trail surrogate, 822 * therefore the lead surrogate must have been the same as in the other string; 823 * compare this decomposition with the lead surrogate in the other string 824 * remember that this simulates bulk text replacement: 825 * the decomposition would replace the entire code point 826 */ 827 --s2; 828 c2=*(s2-1); 829 } 830 } 831 832 /* push current level pointers */ 833 stack1[0].start=start1; 834 stack1[0].s=s1; 835 stack1[0].limit=limit1; 836 ++level1; 837 838 /* copy the folding result to fold1[] */ 839 if(length<=UCASE_MAX_STRING_LENGTH) { 840 u_memcpy(fold1, p, length); 841 } else { 842 int32_t i=0; 843 U16_APPEND_UNSAFE(fold1, i, length); 844 length=i; 845 } 846 847 /* set next level pointers to case folding */ 848 start1=s1=fold1; 849 limit1=fold1+length; 850 851 /* get ready to read from decomposition, continue with loop */ 852 c1=-1; 853 continue; 854 } 855 856 if( level2==0 && 857 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 858 ) { 859 /* cp2 case-folds to the code point "length" or to p[length] */ 860 if(U_IS_SURROGATE(c2)) { 861 if(U_IS_SURROGATE_LEAD(c2)) { 862 /* advance beyond source surrogate pair if it case-folds */ 863 ++s2; 864 } else /* isTrail(c2) */ { 865 /* 866 * we got a supplementary code point when hitting its trail surrogate, 867 * therefore the lead surrogate must have been the same as in the other string; 868 * compare this decomposition with the lead surrogate in the other string 869 * remember that this simulates bulk text replacement: 870 * the decomposition would replace the entire code point 871 */ 872 --s1; 873 c1=*(s1-1); 874 } 875 } 876 877 /* push current level pointers */ 878 stack2[0].start=start2; 879 stack2[0].s=s2; 880 stack2[0].limit=limit2; 881 ++level2; 882 883 /* copy the folding result to fold2[] */ 884 if(length<=UCASE_MAX_STRING_LENGTH) { 885 u_memcpy(fold2, p, length); 886 } else { 887 int32_t i=0; 888 U16_APPEND_UNSAFE(fold2, i, length); 889 length=i; 890 } 891 892 /* set next level pointers to case folding */ 893 start2=s2=fold2; 894 limit2=fold2+length; 895 896 /* get ready to read from decomposition, continue with loop */ 897 c2=-1; 898 continue; 899 } 900 901 /* 902 * no decomposition/case folding, max level for both sides: 903 * return difference result 904 * 905 * code point order comparison must not just return cp1-cp2 906 * because when single surrogates are present then the surrogate pairs 907 * that formed cp1 and cp2 may be from different string indexes 908 * 909 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units 910 * c1=d800 cp1=10001 c2=dc00 cp2=10000 911 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 912 * 913 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 914 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 915 * so we have slightly different pointer/start/limit comparisons here 916 */ 917 918 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 919 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 920 if( 921 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 922 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 923 ) { 924 /* part of a surrogate pair, leave >=d800 */ 925 } else { 926 /* BMP code point - may be surrogate code point - make <d800 */ 927 c1-=0x2800; 928 } 929 930 if( 931 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 932 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 933 ) { 934 /* part of a surrogate pair, leave >=d800 */ 935 } else { 936 /* BMP code point - may be surrogate code point - make <d800 */ 937 c2-=0x2800; 938 } 939 } 940 941 return c1-c2; 942 } 943 } 944 945 /* public API functions */ 946 947 U_CAPI int32_t U_EXPORT2 948 u_strCaseCompare(const UChar *s1, int32_t length1, 949 const UChar *s2, int32_t length2, 950 uint32_t options, 951 UErrorCode *pErrorCode) { 952 /* argument checking */ 953 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { 954 return 0; 955 } 956 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 957 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 958 return 0; 959 } 960 return u_strcmpFold(s1, length1, s2, length2, 961 options|U_COMPARE_IGNORE_CASE, 962 pErrorCode); 963 } 964 965 U_CAPI int32_t U_EXPORT2 966 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { 967 UErrorCode errorCode=U_ZERO_ERROR; 968 return u_strcmpFold(s1, -1, s2, -1, 969 options|U_COMPARE_IGNORE_CASE, 970 &errorCode); 971 } 972 973 U_CAPI int32_t U_EXPORT2 974 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { 975 UErrorCode errorCode=U_ZERO_ERROR; 976 return u_strcmpFold(s1, length, s2, length, 977 options|U_COMPARE_IGNORE_CASE, 978 &errorCode); 979 } 980 981 U_CAPI int32_t U_EXPORT2 982 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { 983 UErrorCode errorCode=U_ZERO_ERROR; 984 return u_strcmpFold(s1, n, s2, n, 985 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), 986 &errorCode); 987 } 988