1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2001-2015, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ustrcase.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2002feb20 14 * created by: Markus W. Scherer 15 * 16 * Implementation file for string casing C API functions. 17 * Uses functions from uchar.c for basic functionality that requires access 18 * to the Unicode Character Database (uprops.dat). 19 */ 20 21 #include "unicode/utypes.h" 22 #include "unicode/brkiter.h" 23 #include "unicode/ustring.h" 24 #include "unicode/ucasemap.h" 25 #include "unicode/ubrk.h" 26 #include "unicode/utf.h" 27 #include "unicode/utf16.h" 28 #include "cmemory.h" 29 #include "ucase.h" 30 #include "ustr_imp.h" 31 #include "uassert.h" 32 33 U_NAMESPACE_USE 34 35 /* string casing ------------------------------------------------------------ */ 36 37 /* Appends a full case mapping result, see UCASE_MAX_STRING_LENGTH. */ 38 static inline int32_t 39 appendResult(UChar *dest, int32_t destIndex, int32_t destCapacity, 40 int32_t result, const UChar *s) { 41 UChar32 c; 42 int32_t length; 43 44 /* decode the result */ 45 if(result<0) { 46 /* (not) original code point */ 47 c=~result; 48 length=-1; 49 } else if(result<=UCASE_MAX_STRING_LENGTH) { 50 c=U_SENTINEL; 51 length=result; 52 } else { 53 c=result; 54 length=-1; 55 } 56 57 if(destIndex<destCapacity) { 58 /* append the result */ 59 if(length<0) { 60 /* code point */ 61 UBool isError=FALSE; 62 U16_APPEND(dest, destIndex, destCapacity, c, isError); 63 if(isError) { 64 /* overflow, nothing written */ 65 destIndex+=U16_LENGTH(c); 66 } 67 } else { 68 /* string */ 69 if((destIndex+length)<=destCapacity) { 70 while(length>0) { 71 dest[destIndex++]=*s++; 72 --length; 73 } 74 } else { 75 /* overflow */ 76 destIndex+=length; 77 } 78 } 79 } else { 80 /* preflight */ 81 if(length<0) { 82 destIndex+=U16_LENGTH(c); 83 } else { 84 destIndex+=length; 85 } 86 } 87 return destIndex; 88 } 89 90 static UChar32 U_CALLCONV 91 utf16_caseContextIterator(void *context, int8_t dir) { 92 UCaseContext *csc=(UCaseContext *)context; 93 UChar32 c; 94 95 if(dir<0) { 96 /* reset for backward iteration */ 97 csc->index=csc->cpStart; 98 csc->dir=dir; 99 } else if(dir>0) { 100 /* reset for forward iteration */ 101 csc->index=csc->cpLimit; 102 csc->dir=dir; 103 } else { 104 /* continue current iteration direction */ 105 dir=csc->dir; 106 } 107 108 if(dir<0) { 109 if(csc->start<csc->index) { 110 U16_PREV((const UChar *)csc->p, csc->start, csc->index, c); 111 return c; 112 } 113 } else { 114 if(csc->index<csc->limit) { 115 U16_NEXT((const UChar *)csc->p, csc->index, csc->limit, c); 116 return c; 117 } 118 } 119 return U_SENTINEL; 120 } 121 122 /* 123 * Case-maps [srcStart..srcLimit[ but takes 124 * context [0..srcLength[ into account. 125 */ 126 static int32_t 127 _caseMap(const UCaseMap *csm, UCaseMapFull *map, 128 UChar *dest, int32_t destCapacity, 129 const UChar *src, UCaseContext *csc, 130 int32_t srcStart, int32_t srcLimit, 131 UErrorCode *pErrorCode) { 132 const UChar *s; 133 UChar32 c, c2 = 0; 134 int32_t srcIndex, destIndex; 135 int32_t locCache; 136 137 locCache=csm->locCache; 138 139 /* case mapping loop */ 140 srcIndex=srcStart; 141 destIndex=0; 142 while(srcIndex<srcLimit) { 143 csc->cpStart=srcIndex; 144 U16_NEXT(src, srcIndex, srcLimit, c); 145 csc->cpLimit=srcIndex; 146 c=map(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &locCache); 147 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 148 /* fast path version of appendResult() for BMP results */ 149 dest[destIndex++]=(UChar)c2; 150 } else { 151 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 152 } 153 } 154 155 if(destIndex>destCapacity) { 156 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 157 } 158 return destIndex; 159 } 160 161 #if !UCONFIG_NO_BREAK_ITERATION 162 163 U_CFUNC int32_t U_CALLCONV 164 ustrcase_internalToTitle(const UCaseMap *csm, 165 UChar *dest, int32_t destCapacity, 166 const UChar *src, int32_t srcLength, 167 UErrorCode *pErrorCode) { 168 const UChar *s; 169 UChar32 c; 170 int32_t prev, titleStart, titleLimit, idx, destIndex, length; 171 UBool isFirstIndex; 172 173 if(U_FAILURE(*pErrorCode)) { 174 return 0; 175 } 176 177 // Use the C++ abstract base class to minimize dependencies. 178 // TODO: Change UCaseMap.iter to store a BreakIterator directly. 179 BreakIterator *bi=reinterpret_cast<BreakIterator *>(csm->iter); 180 181 /* set up local variables */ 182 int32_t locCache=csm->locCache; 183 UCaseContext csc=UCASECONTEXT_INITIALIZER; 184 csc.p=(void *)src; 185 csc.limit=srcLength; 186 destIndex=0; 187 prev=0; 188 isFirstIndex=TRUE; 189 190 /* titlecasing loop */ 191 while(prev<srcLength) { 192 /* find next index where to titlecase */ 193 if(isFirstIndex) { 194 isFirstIndex=FALSE; 195 idx=bi->first(); 196 } else { 197 idx=bi->next(); 198 } 199 if(idx==UBRK_DONE || idx>srcLength) { 200 idx=srcLength; 201 } 202 203 /* 204 * Unicode 4 & 5 section 3.13 Default Case Operations: 205 * 206 * R3 toTitlecase(X): Find the word boundaries based on Unicode Standard Annex 207 * #29, "Text Boundaries." Between each pair of word boundaries, find the first 208 * cased character F. If F exists, map F to default_title(F); then map each 209 * subsequent character C to default_lower(C). 210 * 211 * In this implementation, segment [prev..index[ into 3 parts: 212 * a) uncased characters (copy as-is) [prev..titleStart[ 213 * b) first case letter (titlecase) [titleStart..titleLimit[ 214 * c) subsequent characters (lowercase) [titleLimit..index[ 215 */ 216 if(prev<idx) { 217 /* find and copy uncased characters [prev..titleStart[ */ 218 titleStart=titleLimit=prev; 219 U16_NEXT(src, titleLimit, idx, c); 220 if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) { 221 /* Adjust the titlecasing index (titleStart) to the next cased character. */ 222 for(;;) { 223 titleStart=titleLimit; 224 if(titleLimit==idx) { 225 /* 226 * only uncased characters in [prev..index[ 227 * stop with titleStart==titleLimit==index 228 */ 229 break; 230 } 231 U16_NEXT(src, titleLimit, idx, c); 232 if(UCASE_NONE!=ucase_getType(csm->csp, c)) { 233 break; /* cased letter at [titleStart..titleLimit[ */ 234 } 235 } 236 length=titleStart-prev; 237 if(length>0) { 238 if((destIndex+length)<=destCapacity) { 239 uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR); 240 } 241 destIndex+=length; 242 } 243 } 244 245 if(titleStart<titleLimit) { 246 /* titlecase c which is from [titleStart..titleLimit[ */ 247 csc.cpStart=titleStart; 248 csc.cpLimit=titleLimit; 249 c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, &csc, &s, csm->locale, &locCache); 250 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 251 252 /* Special case Dutch IJ titlecasing */ 253 if ( titleStart+1 < idx && 254 ucase_getCaseLocale(csm->locale,&locCache) == UCASE_LOC_DUTCH && 255 ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) && 256 ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 257 c=(UChar32) 0x004A; 258 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 259 titleLimit++; 260 } 261 262 /* lowercase [titleLimit..index[ */ 263 if(titleLimit<idx) { 264 if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) { 265 /* Normal operation: Lowercase the rest of the word. */ 266 destIndex+= 267 _caseMap( 268 csm, ucase_toFullLower, 269 dest+destIndex, destCapacity-destIndex, 270 src, &csc, 271 titleLimit, idx, 272 pErrorCode); 273 } else { 274 /* Optionally just copy the rest of the word unchanged. */ 275 length=idx-titleLimit; 276 if((destIndex+length)<=destCapacity) { 277 uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR); 278 } 279 destIndex+=length; 280 } 281 } 282 } 283 } 284 285 prev=idx; 286 } 287 288 if(destIndex>destCapacity) { 289 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 290 } 291 return destIndex; 292 } 293 294 #endif // !UCONFIG_NO_BREAK_ITERATION 295 296 /* functions available in the common library (for unistr_case.cpp) */ 297 298 U_CFUNC int32_t U_CALLCONV 299 ustrcase_internalToLower(const UCaseMap *csm, 300 UChar *dest, int32_t destCapacity, 301 const UChar *src, int32_t srcLength, 302 UErrorCode *pErrorCode) { 303 UCaseContext csc=UCASECONTEXT_INITIALIZER; 304 csc.p=(void *)src; 305 csc.limit=srcLength; 306 return _caseMap( 307 csm, ucase_toFullLower, 308 dest, destCapacity, 309 src, &csc, 0, srcLength, 310 pErrorCode); 311 } 312 313 U_CFUNC int32_t U_CALLCONV 314 ustrcase_internalToUpper(const UCaseMap *csm, 315 UChar *dest, int32_t destCapacity, 316 const UChar *src, int32_t srcLength, 317 UErrorCode *pErrorCode) { 318 UCaseContext csc=UCASECONTEXT_INITIALIZER; 319 csc.p=(void *)src; 320 csc.limit=srcLength; 321 return _caseMap( 322 csm, ucase_toFullUpper, 323 dest, destCapacity, 324 src, &csc, 0, srcLength, 325 pErrorCode); 326 } 327 328 static int32_t 329 ustr_foldCase(const UCaseProps *csp, 330 UChar *dest, int32_t destCapacity, 331 const UChar *src, int32_t srcLength, 332 uint32_t options, 333 UErrorCode *pErrorCode) { 334 int32_t srcIndex, destIndex; 335 336 const UChar *s; 337 UChar32 c, c2 = 0; 338 339 /* case mapping loop */ 340 srcIndex=destIndex=0; 341 while(srcIndex<srcLength) { 342 U16_NEXT(src, srcIndex, srcLength, c); 343 c=ucase_toFullFolding(csp, c, &s, options); 344 if((destIndex<destCapacity) && (c<0 ? (c2=~c)<=0xffff : UCASE_MAX_STRING_LENGTH<c && (c2=c)<=0xffff)) { 345 /* fast path version of appendResult() for BMP results */ 346 dest[destIndex++]=(UChar)c2; 347 } else { 348 destIndex=appendResult(dest, destIndex, destCapacity, c, s); 349 } 350 } 351 352 if(destIndex>destCapacity) { 353 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 354 } 355 return destIndex; 356 } 357 358 U_CFUNC int32_t U_CALLCONV 359 ustrcase_internalFold(const UCaseMap *csm, 360 UChar *dest, int32_t destCapacity, 361 const UChar *src, int32_t srcLength, 362 UErrorCode *pErrorCode) { 363 return ustr_foldCase(csm->csp, dest, destCapacity, src, srcLength, csm->options, pErrorCode); 364 } 365 366 U_CFUNC int32_t 367 ustrcase_map(const UCaseMap *csm, 368 UChar *dest, int32_t destCapacity, 369 const UChar *src, int32_t srcLength, 370 UStringCaseMapper *stringCaseMapper, 371 UErrorCode *pErrorCode) { 372 UChar buffer[300]; 373 UChar *temp; 374 375 int32_t destLength; 376 377 /* check argument values */ 378 if(U_FAILURE(*pErrorCode)) { 379 return 0; 380 } 381 if( destCapacity<0 || 382 (dest==NULL && destCapacity>0) || 383 src==NULL || 384 srcLength<-1 385 ) { 386 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 387 return 0; 388 } 389 390 /* get the string length */ 391 if(srcLength==-1) { 392 srcLength=u_strlen(src); 393 } 394 395 /* check for overlapping source and destination */ 396 if( dest!=NULL && 397 ((src>=dest && src<(dest+destCapacity)) || 398 (dest>=src && dest<(src+srcLength))) 399 ) { 400 /* overlap: provide a temporary destination buffer and later copy the result */ 401 if(destCapacity<=UPRV_LENGTHOF(buffer)) { 402 /* the stack buffer is large enough */ 403 temp=buffer; 404 } else { 405 /* allocate a buffer */ 406 temp=(UChar *)uprv_malloc(destCapacity*U_SIZEOF_UCHAR); 407 if(temp==NULL) { 408 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 409 return 0; 410 } 411 } 412 } else { 413 temp=dest; 414 } 415 416 destLength=stringCaseMapper(csm, temp, destCapacity, src, srcLength, pErrorCode); 417 if(temp!=dest) { 418 /* copy the result string to the destination buffer */ 419 if(destLength>0) { 420 int32_t copyLength= destLength<=destCapacity ? destLength : destCapacity; 421 if(copyLength>0) { 422 uprv_memmove(dest, temp, copyLength*U_SIZEOF_UCHAR); 423 } 424 } 425 if(temp!=buffer) { 426 uprv_free(temp); 427 } 428 } 429 430 return u_terminateUChars(dest, destCapacity, destLength, pErrorCode); 431 } 432 433 /* public API functions */ 434 435 U_CAPI int32_t U_EXPORT2 436 u_strFoldCase(UChar *dest, int32_t destCapacity, 437 const UChar *src, int32_t srcLength, 438 uint32_t options, 439 UErrorCode *pErrorCode) { 440 UCaseMap csm=UCASEMAP_INITIALIZER; 441 csm.csp=ucase_getSingleton(); 442 csm.options=options; 443 return ustrcase_map( 444 &csm, 445 dest, destCapacity, 446 src, srcLength, 447 ustrcase_internalFold, pErrorCode); 448 } 449 450 /* case-insensitive string comparisons -------------------------------------- */ 451 452 /* 453 * This function is a copy of unorm_cmpEquivFold() minus the parts for 454 * canonical equivalence. 455 * Keep the functions in sync, and see there for how this works. 456 * The duplication is for modularization: 457 * It makes caseless (but not canonical caseless) matches independent of 458 * the normalization code. 459 */ 460 461 /* stack element for previous-level source/decomposition pointers */ 462 struct CmpEquivLevel { 463 const UChar *start, *s, *limit; 464 }; 465 typedef struct CmpEquivLevel CmpEquivLevel; 466 467 /** 468 * Internal implementation code comparing string with case fold. 469 * This function is called from u_strcmpFold() and u_caseInsensitivePrefixMatch(). 470 * 471 * @param s1 input string 1 472 * @param length1 length of string 1, or -1 (NULL terminated) 473 * @param s2 input string 2 474 * @param length2 length of string 2, or -1 (NULL terminated) 475 * @param options compare options 476 * @param matchLen1 (output) length of partial prefix match in s1 477 * @param matchLen2 (output) length of partial prefix match in s2 478 * @param pErrorCode receives error status 479 * @return The result of comparison 480 */ 481 static int32_t _cmpFold( 482 const UChar *s1, int32_t length1, 483 const UChar *s2, int32_t length2, 484 uint32_t options, 485 int32_t *matchLen1, int32_t *matchLen2, 486 UErrorCode *pErrorCode) { 487 int32_t cmpRes = 0; 488 489 const UCaseProps *csp; 490 491 /* current-level start/limit - s1/s2 as current */ 492 const UChar *start1, *start2, *limit1, *limit2; 493 494 /* points to the original start address */ 495 const UChar *org1, *org2; 496 497 /* points to the end of match + 1 */ 498 const UChar *m1, *m2; 499 500 /* case folding variables */ 501 const UChar *p; 502 int32_t length; 503 504 /* stacks of previous-level start/current/limit */ 505 CmpEquivLevel stack1[2], stack2[2]; 506 507 /* case folding buffers, only use current-level start/limit */ 508 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 509 510 /* track which is the current level per string */ 511 int32_t level1, level2; 512 513 /* current code units, and code points for lookups */ 514 UChar32 c1, c2, cp1, cp2; 515 516 /* no argument error checking because this itself is not an API */ 517 518 /* 519 * assume that at least the option U_COMPARE_IGNORE_CASE is set 520 * otherwise this function would have to behave exactly as uprv_strCompare() 521 */ 522 csp=ucase_getSingleton(); 523 if(U_FAILURE(*pErrorCode)) { 524 return 0; 525 } 526 527 /* initialize */ 528 if(matchLen1) { 529 U_ASSERT(matchLen2 !=NULL); 530 *matchLen1=0; 531 *matchLen2=0; 532 } 533 534 start1=m1=org1=s1; 535 if(length1==-1) { 536 limit1=NULL; 537 } else { 538 limit1=s1+length1; 539 } 540 541 start2=m2=org2=s2; 542 if(length2==-1) { 543 limit2=NULL; 544 } else { 545 limit2=s2+length2; 546 } 547 548 level1=level2=0; 549 c1=c2=-1; 550 551 /* comparison loop */ 552 for(;;) { 553 /* 554 * here a code unit value of -1 means "get another code unit" 555 * below it will mean "this source is finished" 556 */ 557 558 if(c1<0) { 559 /* get next code unit from string 1, post-increment */ 560 for(;;) { 561 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 562 if(level1==0) { 563 c1=-1; 564 break; 565 } 566 } else { 567 ++s1; 568 break; 569 } 570 571 /* reached end of level buffer, pop one level */ 572 do { 573 --level1; 574 start1=stack1[level1].start; /*Not uninitialized*/ 575 } while(start1==NULL); 576 s1=stack1[level1].s; /*Not uninitialized*/ 577 limit1=stack1[level1].limit; /*Not uninitialized*/ 578 } 579 } 580 581 if(c2<0) { 582 /* get next code unit from string 2, post-increment */ 583 for(;;) { 584 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 585 if(level2==0) { 586 c2=-1; 587 break; 588 } 589 } else { 590 ++s2; 591 break; 592 } 593 594 /* reached end of level buffer, pop one level */ 595 do { 596 --level2; 597 start2=stack2[level2].start; /*Not uninitialized*/ 598 } while(start2==NULL); 599 s2=stack2[level2].s; /*Not uninitialized*/ 600 limit2=stack2[level2].limit; /*Not uninitialized*/ 601 } 602 } 603 604 /* 605 * compare c1 and c2 606 * either variable c1, c2 is -1 only if the corresponding string is finished 607 */ 608 if(c1==c2) { 609 const UChar *next1, *next2; 610 611 if(c1<0) { 612 cmpRes=0; /* c1==c2==-1 indicating end of strings */ 613 break; 614 } 615 616 /* 617 * Note: Move the match positions in both strings at the same time 618 * only when corresponding code point(s) in the original strings 619 * are fully consumed. For example, when comparing s1="Fust" and 620 * s2="Fu\u00dfball", s2[2] is folded into "ss", and s1[2] matches 621 * the first code point in the case-folded data. But the second "s" 622 * has no matching code point in s1, so this implementation returns 623 * 2 as the prefix match length ("Fu"). 624 */ 625 next1=next2=NULL; 626 if(level1==0) { 627 next1=s1; 628 } else if(s1==limit1) { 629 /* Note: This implementation only use a single level of stack. 630 * If this code needs to be changed to use multiple levels 631 * of stacks, the code above should check if the current 632 * code is at the end of all stacks. 633 */ 634 U_ASSERT(level1==1); 635 636 /* is s1 at the end of the current stack? */ 637 next1=stack1[0].s; 638 } 639 640 if (next1!=NULL) { 641 if(level2==0) { 642 next2=s2; 643 } else if(s2==limit2) { 644 U_ASSERT(level2==1); 645 646 /* is s2 at the end of the current stack? */ 647 next2=stack2[0].s; 648 } 649 if(next2!=NULL) { 650 m1=next1; 651 m2=next2; 652 } 653 } 654 c1=c2=-1; /* make us fetch new code units */ 655 continue; 656 } else if(c1<0) { 657 cmpRes=-1; /* string 1 ends before string 2 */ 658 break; 659 } else if(c2<0) { 660 cmpRes=1; /* string 2 ends before string 1 */ 661 break; 662 } 663 /* c1!=c2 && c1>=0 && c2>=0 */ 664 665 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 666 cp1=c1; 667 if(U_IS_SURROGATE(c1)) { 668 UChar c; 669 670 if(U_IS_SURROGATE_LEAD(c1)) { 671 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 672 /* advance ++s1; only below if cp1 decomposes/case-folds */ 673 cp1=U16_GET_SUPPLEMENTARY(c1, c); 674 } 675 } else /* isTrail(c1) */ { 676 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 677 cp1=U16_GET_SUPPLEMENTARY(c, c1); 678 } 679 } 680 } 681 682 cp2=c2; 683 if(U_IS_SURROGATE(c2)) { 684 UChar c; 685 686 if(U_IS_SURROGATE_LEAD(c2)) { 687 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 688 /* advance ++s2; only below if cp2 decomposes/case-folds */ 689 cp2=U16_GET_SUPPLEMENTARY(c2, c); 690 } 691 } else /* isTrail(c2) */ { 692 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 693 cp2=U16_GET_SUPPLEMENTARY(c, c2); 694 } 695 } 696 } 697 698 /* 699 * go down one level for each string 700 * continue with the main loop as soon as there is a real change 701 */ 702 703 if( level1==0 && 704 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 705 ) { 706 /* cp1 case-folds to the code point "length" or to p[length] */ 707 if(U_IS_SURROGATE(c1)) { 708 if(U_IS_SURROGATE_LEAD(c1)) { 709 /* advance beyond source surrogate pair if it case-folds */ 710 ++s1; 711 } else /* isTrail(c1) */ { 712 /* 713 * we got a supplementary code point when hitting its trail surrogate, 714 * therefore the lead surrogate must have been the same as in the other string; 715 * compare this decomposition with the lead surrogate in the other string 716 * remember that this simulates bulk text replacement: 717 * the decomposition would replace the entire code point 718 */ 719 --s2; 720 --m2; 721 c2=*(s2-1); 722 } 723 } 724 725 /* push current level pointers */ 726 stack1[0].start=start1; 727 stack1[0].s=s1; 728 stack1[0].limit=limit1; 729 ++level1; 730 731 /* copy the folding result to fold1[] */ 732 if(length<=UCASE_MAX_STRING_LENGTH) { 733 u_memcpy(fold1, p, length); 734 } else { 735 int32_t i=0; 736 U16_APPEND_UNSAFE(fold1, i, length); 737 length=i; 738 } 739 740 /* set next level pointers to case folding */ 741 start1=s1=fold1; 742 limit1=fold1+length; 743 744 /* get ready to read from decomposition, continue with loop */ 745 c1=-1; 746 continue; 747 } 748 749 if( level2==0 && 750 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 751 ) { 752 /* cp2 case-folds to the code point "length" or to p[length] */ 753 if(U_IS_SURROGATE(c2)) { 754 if(U_IS_SURROGATE_LEAD(c2)) { 755 /* advance beyond source surrogate pair if it case-folds */ 756 ++s2; 757 } else /* isTrail(c2) */ { 758 /* 759 * we got a supplementary code point when hitting its trail surrogate, 760 * therefore the lead surrogate must have been the same as in the other string; 761 * compare this decomposition with the lead surrogate in the other string 762 * remember that this simulates bulk text replacement: 763 * the decomposition would replace the entire code point 764 */ 765 --s1; 766 --m2; 767 c1=*(s1-1); 768 } 769 } 770 771 /* push current level pointers */ 772 stack2[0].start=start2; 773 stack2[0].s=s2; 774 stack2[0].limit=limit2; 775 ++level2; 776 777 /* copy the folding result to fold2[] */ 778 if(length<=UCASE_MAX_STRING_LENGTH) { 779 u_memcpy(fold2, p, length); 780 } else { 781 int32_t i=0; 782 U16_APPEND_UNSAFE(fold2, i, length); 783 length=i; 784 } 785 786 /* set next level pointers to case folding */ 787 start2=s2=fold2; 788 limit2=fold2+length; 789 790 /* get ready to read from decomposition, continue with loop */ 791 c2=-1; 792 continue; 793 } 794 795 /* 796 * no decomposition/case folding, max level for both sides: 797 * return difference result 798 * 799 * code point order comparison must not just return cp1-cp2 800 * because when single surrogates are present then the surrogate pairs 801 * that formed cp1 and cp2 may be from different string indexes 802 * 803 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units 804 * c1=d800 cp1=10001 c2=dc00 cp2=10000 805 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 806 * 807 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 808 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 809 * so we have slightly different pointer/start/limit comparisons here 810 */ 811 812 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 813 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 814 if( 815 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 816 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 817 ) { 818 /* part of a surrogate pair, leave >=d800 */ 819 } else { 820 /* BMP code point - may be surrogate code point - make <d800 */ 821 c1-=0x2800; 822 } 823 824 if( 825 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 826 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 827 ) { 828 /* part of a surrogate pair, leave >=d800 */ 829 } else { 830 /* BMP code point - may be surrogate code point - make <d800 */ 831 c2-=0x2800; 832 } 833 } 834 835 cmpRes=c1-c2; 836 break; 837 } 838 839 if(matchLen1) { 840 *matchLen1=m1-org1; 841 *matchLen2=m2-org2; 842 } 843 return cmpRes; 844 } 845 846 /* internal function */ 847 U_CFUNC int32_t 848 u_strcmpFold(const UChar *s1, int32_t length1, 849 const UChar *s2, int32_t length2, 850 uint32_t options, 851 UErrorCode *pErrorCode) { 852 return _cmpFold(s1, length1, s2, length2, options, NULL, NULL, pErrorCode); 853 } 854 855 /* public API functions */ 856 857 U_CAPI int32_t U_EXPORT2 858 u_strCaseCompare(const UChar *s1, int32_t length1, 859 const UChar *s2, int32_t length2, 860 uint32_t options, 861 UErrorCode *pErrorCode) { 862 /* argument checking */ 863 if(pErrorCode==0 || U_FAILURE(*pErrorCode)) { 864 return 0; 865 } 866 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 867 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 868 return 0; 869 } 870 return u_strcmpFold(s1, length1, s2, length2, 871 options|U_COMPARE_IGNORE_CASE, 872 pErrorCode); 873 } 874 875 U_CAPI int32_t U_EXPORT2 876 u_strcasecmp(const UChar *s1, const UChar *s2, uint32_t options) { 877 UErrorCode errorCode=U_ZERO_ERROR; 878 return u_strcmpFold(s1, -1, s2, -1, 879 options|U_COMPARE_IGNORE_CASE, 880 &errorCode); 881 } 882 883 U_CAPI int32_t U_EXPORT2 884 u_memcasecmp(const UChar *s1, const UChar *s2, int32_t length, uint32_t options) { 885 UErrorCode errorCode=U_ZERO_ERROR; 886 return u_strcmpFold(s1, length, s2, length, 887 options|U_COMPARE_IGNORE_CASE, 888 &errorCode); 889 } 890 891 U_CAPI int32_t U_EXPORT2 892 u_strncasecmp(const UChar *s1, const UChar *s2, int32_t n, uint32_t options) { 893 UErrorCode errorCode=U_ZERO_ERROR; 894 return u_strcmpFold(s1, n, s2, n, 895 options|(U_COMPARE_IGNORE_CASE|_STRNCMP_STYLE), 896 &errorCode); 897 } 898 899 /* internal API - detect length of shared prefix */ 900 U_CAPI void 901 u_caseInsensitivePrefixMatch(const UChar *s1, int32_t length1, 902 const UChar *s2, int32_t length2, 903 uint32_t options, 904 int32_t *matchLen1, int32_t *matchLen2, 905 UErrorCode *pErrorCode) { 906 _cmpFold(s1, length1, s2, length2, options, 907 matchLen1, matchLen2, pErrorCode); 908 } 909