1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2004-2009, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucase.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2004aug30 14 * created by: Markus W. Scherer 15 * 16 * Low-level Unicode character/string case mapping code. 17 * Much code moved here (and modified) from uchar.c. 18 */ 19 20 #include "unicode/utypes.h" 21 #include "unicode/uset.h" 22 #include "unicode/udata.h" /* UDataInfo */ 23 #include "ucmndata.h" /* DataHeader */ 24 #include "udatamem.h" 25 #include "umutex.h" 26 #include "uassert.h" 27 #include "cmemory.h" 28 #include "utrie2.h" 29 #include "ucase.h" 30 #include "ucln_cmn.h" 31 32 struct UCaseProps { 33 UDataMemory *mem; 34 const int32_t *indexes; 35 const uint16_t *exceptions; 36 const UChar *unfold; 37 38 UTrie2 trie; 39 uint8_t formatVersion[4]; 40 }; 41 42 /* data loading etc. 
-------------------------------------------------------- */ 43 44 #if UCASE_HARDCODE_DATA 45 46 /* ucase_props_data.c is machine-generated by gencase --csource */ 47 #include "ucase_props_data.c" 48 49 #else 50 51 static UBool U_CALLCONV 52 isAcceptable(void *context, 53 const char *type, const char *name, 54 const UDataInfo *pInfo) { 55 if( 56 pInfo->size>=20 && 57 pInfo->isBigEndian==U_IS_BIG_ENDIAN && 58 pInfo->charsetFamily==U_CHARSET_FAMILY && 59 pInfo->dataFormat[0]==UCASE_FMT_0 && /* dataFormat="cAsE" */ 60 pInfo->dataFormat[1]==UCASE_FMT_1 && 61 pInfo->dataFormat[2]==UCASE_FMT_2 && 62 pInfo->dataFormat[3]==UCASE_FMT_3 && 63 pInfo->formatVersion[0]==1 && 64 pInfo->formatVersion[2]==UTRIE_SHIFT && 65 pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT 66 ) { 67 UCaseProps *csp=(UCaseProps *)context; 68 uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4); 69 return TRUE; 70 } else { 71 return FALSE; 72 } 73 } 74 75 static UCaseProps * 76 ucase_openData(UCaseProps *cspProto, 77 const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) { 78 UCaseProps *csp; 79 int32_t size; 80 81 cspProto->indexes=(const int32_t *)bin; 82 if( (length>=0 && length<16*4) || 83 cspProto->indexes[UCASE_IX_INDEX_TOP]<16 84 ) { 85 /* length or indexes[] too short for minimum indexes[] length of 16 */ 86 *pErrorCode=U_INVALID_FORMAT_ERROR; 87 return NULL; 88 } 89 size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4; 90 if(length>=0) { 91 if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) { 92 length-=size; 93 } else { 94 /* length too short for indexes[] or for the whole data length */ 95 *pErrorCode=U_INVALID_FORMAT_ERROR; 96 return NULL; 97 } 98 } 99 bin+=size; 100 /* from here on, assume that the sizes of the items fit into the total length */ 101 102 /* unserialize the trie, after indexes[] */ 103 size=cspProto->indexes[UCASE_IX_TRIE_SIZE]; 104 utrie_unserialize(&cspProto->trie, bin, size, pErrorCode); 105 if(U_FAILURE(*pErrorCode)) { 106 return NULL; 107 } 108 bin+=size; 109 110 
/* get exceptions[] */ 111 size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH]; 112 cspProto->exceptions=(const uint16_t *)bin; 113 bin+=size; 114 115 /* get unfold[] */ 116 size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH]; 117 if(size!=0) { 118 cspProto->unfold=(const UChar *)bin; 119 bin+=size; 120 } else { 121 cspProto->unfold=NULL; 122 } 123 124 /* allocate, copy, and return the new UCaseProps */ 125 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)); 126 if(csp==NULL) { 127 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 128 return NULL; 129 } else { 130 uprv_memcpy(csp, cspProto, sizeof(UCaseProps)); 131 return csp; 132 } 133 } 134 135 U_CAPI UCaseProps * U_EXPORT2 136 ucase_open(UErrorCode *pErrorCode) { 137 UCaseProps cspProto={ NULL }, *csp; 138 139 cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode); 140 if(U_FAILURE(*pErrorCode)) { 141 return NULL; 142 } 143 144 csp=ucase_openData( 145 &cspProto, 146 udata_getMemory(cspProto.mem), 147 udata_getLength(cspProto.mem), 148 pErrorCode); 149 if(U_FAILURE(*pErrorCode)) { 150 udata_close(cspProto.mem); 151 return NULL; 152 } else { 153 return csp; 154 } 155 } 156 157 U_CAPI UCaseProps * U_EXPORT2 158 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) { 159 UCaseProps cspProto={ NULL }; 160 const DataHeader *hdr; 161 162 if(U_FAILURE(*pErrorCode)) { 163 return NULL; 164 } 165 if(bin==NULL) { 166 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 167 return NULL; 168 } 169 170 /* check the header */ 171 if(length>=0 && length<20) { 172 *pErrorCode=U_INVALID_FORMAT_ERROR; 173 return NULL; 174 } 175 hdr=(const DataHeader *)bin; 176 if( 177 !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 && 178 hdr->info.isBigEndian==U_IS_BIG_ENDIAN && 179 isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info)) 180 ) { 181 *pErrorCode=U_INVALID_FORMAT_ERROR; 182 return NULL; 183 } 184 185 bin+=hdr->dataHeader.headerSize; 186 if(length>=0) { 187 
length-=hdr->dataHeader.headerSize; 188 } 189 return ucase_openData(&cspProto, bin, length, pErrorCode); 190 } 191 192 #endif 193 194 U_CAPI void U_EXPORT2 195 ucase_close(UCaseProps *csp) { 196 if(csp!=NULL) { 197 #if !UCASE_HARDCODE_DATA 198 udata_close(csp->mem); 199 #endif 200 uprv_free(csp); 201 } 202 } 203 204 /* UCaseProps singleton ----------------------------------------------------- */ 205 206 #if !UCASE_HARDCODE_DATA 207 static UCaseProps *gCsp=NULL; 208 static UCaseProps *gCspDummy=NULL; 209 static UErrorCode gErrorCode=U_ZERO_ERROR; 210 static int8_t gHaveData=0; 211 #endif 212 213 #if !UCASE_HARDCODE_DATA 214 static UBool U_CALLCONV ucase_cleanup(void) { 215 ucase_close(gCsp); 216 gCsp=NULL; 217 ucase_close(gCspDummy); 218 gCspDummy=NULL; 219 gErrorCode=U_ZERO_ERROR; 220 gHaveData=0; 221 return TRUE; 222 } 223 #endif 224 225 U_CAPI const UCaseProps * U_EXPORT2 226 ucase_getSingleton(UErrorCode *pErrorCode) { 227 #if UCASE_HARDCODE_DATA 228 if(U_FAILURE(*pErrorCode)) { 229 return NULL; 230 } 231 return &ucase_props_singleton; 232 #else 233 int8_t haveData; 234 235 if(U_FAILURE(*pErrorCode)) { 236 return NULL; 237 } 238 239 UMTX_CHECK(NULL, gHaveData, haveData); 240 241 if(haveData>0) { 242 /* data was loaded */ 243 return gCsp; 244 } else if(haveData<0) { 245 /* data loading failed */ 246 *pErrorCode=gErrorCode; 247 return NULL; 248 } else /* haveData==0 */ { 249 /* load the data */ 250 UCaseProps *csp=ucase_open(pErrorCode); 251 if(U_FAILURE(*pErrorCode)) { 252 gHaveData=-1; 253 gErrorCode=*pErrorCode; 254 return NULL; 255 } 256 257 /* set the static variables */ 258 umtx_lock(NULL); 259 if(gCsp==NULL) { 260 gCsp=csp; 261 csp=NULL; 262 gHaveData=1; 263 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup); 264 } 265 umtx_unlock(NULL); 266 267 ucase_close(csp); 268 return gCsp; 269 } 270 #endif 271 } 272 273 #if !UCASE_HARDCODE_DATA 274 U_CAPI const UCaseProps * U_EXPORT2 275 ucase_getDummy(UErrorCode *pErrorCode) { 276 UCaseProps *csp; 277 278 
if(U_FAILURE(*pErrorCode)) { 279 return NULL; 280 } 281 282 UMTX_CHECK(NULL, gCspDummy, csp); 283 284 if(csp!=NULL) { 285 /* the dummy object was already created */ 286 return csp; 287 } else /* csp==NULL */ { 288 /* create the dummy object */ 289 int32_t *indexes; 290 291 csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE); 292 if(csp==NULL) { 293 *pErrorCode=U_MEMORY_ALLOCATION_ERROR; 294 return NULL; 295 } 296 uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4); 297 298 csp->indexes=indexes=(int32_t *)(csp+1); 299 indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP; 300 301 indexes[UCASE_IX_TRIE_SIZE]= 302 utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode); 303 if(U_FAILURE(*pErrorCode)) { 304 uprv_free(csp); 305 return NULL; 306 } 307 308 csp->formatVersion[0]=1; 309 csp->formatVersion[2]=UTRIE_SHIFT; 310 csp->formatVersion[3]=UTRIE_INDEX_SHIFT; 311 312 /* set the static variables */ 313 umtx_lock(NULL); 314 if(gCspDummy==NULL) { 315 gCspDummy=csp; 316 csp=NULL; 317 ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup); 318 } 319 umtx_unlock(NULL); 320 321 uprv_free(csp); 322 return gCspDummy; 323 } 324 } 325 #endif 326 327 /* set of property starts for UnicodeSet ------------------------------------ */ 328 329 static UBool U_CALLCONV 330 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) { 331 /* add the start code point to the USet */ 332 const USetAdder *sa=(const USetAdder *)context; 333 sa->add(sa->set, start); 334 return TRUE; 335 } 336 337 U_CFUNC void U_EXPORT2 338 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) { 339 if(U_FAILURE(*pErrorCode)) { 340 return; 341 } 342 343 /* add the start code point of each same-value range of the trie */ 344 utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa); 345 346 /* add code points with hardcoded properties, plus the ones following them */ 347 348 
/* (none right now, see comment below) */ 349 350 /* 351 * Omit code points with hardcoded specialcasing properties 352 * because we do not build property UnicodeSets for them right now. 353 */ 354 } 355 356 /* data access primitives --------------------------------------------------- */ 357 358 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) 359 360 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) 361 362 /* number of bits in an 8-bit integer value */ 363 static const uint8_t flagsOffset[256]={ 364 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 365 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 366 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 367 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 368 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 369 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 370 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 371 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 372 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 373 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 374 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 375 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 376 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 377 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 378 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 379 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 380 }; 381 382 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) 383 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] 384 385 /* 386 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx). 
387 * 388 * @param excWord (in) initial exceptions word 389 * @param idx (in) desired slot index 390 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++; 391 * moved to the last uint16_t of the value, use +1 for beginning of next slot 392 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified 393 */ 394 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \ 395 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \ 396 (pExc16)+=SLOT_OFFSET(excWord, idx); \ 397 (value)=*pExc16; \ 398 } else { \ 399 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \ 400 (value)=*pExc16++; \ 401 (value)=((value)<<16)|*pExc16; \ 402 } 403 404 /* simple case mappings ----------------------------------------------------- */ 405 406 U_CAPI UChar32 U_EXPORT2 407 ucase_tolower(const UCaseProps *csp, UChar32 c) { 408 uint16_t props=UTRIE2_GET16(&csp->trie, c); 409 if(!PROPS_HAS_EXCEPTION(props)) { 410 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 411 c+=UCASE_GET_DELTA(props); 412 } 413 } else { 414 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 415 uint16_t excWord=*pe++; 416 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 417 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); 418 } 419 } 420 return c; 421 } 422 423 U_CAPI UChar32 U_EXPORT2 424 ucase_toupper(const UCaseProps *csp, UChar32 c) { 425 uint16_t props=UTRIE2_GET16(&csp->trie, c); 426 if(!PROPS_HAS_EXCEPTION(props)) { 427 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 428 c+=UCASE_GET_DELTA(props); 429 } 430 } else { 431 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 432 uint16_t excWord=*pe++; 433 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 434 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); 435 } 436 } 437 return c; 438 } 439 440 U_CAPI UChar32 U_EXPORT2 441 ucase_totitle(const UCaseProps *csp, UChar32 c) { 442 uint16_t props=UTRIE2_GET16(&csp->trie, c); 443 if(!PROPS_HAS_EXCEPTION(props)) { 444 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 445 c+=UCASE_GET_DELTA(props); 446 } 447 } else { 448 const uint16_t *pe=GET_EXCEPTIONS(csp, 
props); 449 uint16_t excWord=*pe++; 450 int32_t idx; 451 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 452 idx=UCASE_EXC_TITLE; 453 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 454 idx=UCASE_EXC_UPPER; 455 } else { 456 return c; 457 } 458 GET_SLOT_VALUE(excWord, idx, pe, c); 459 } 460 return c; 461 } 462 463 static const UChar iDot[2] = { 0x69, 0x307 }; 464 static const UChar jDot[2] = { 0x6a, 0x307 }; 465 static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; 466 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; 467 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; 468 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; 469 470 471 U_CFUNC void U_EXPORT2 472 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) { 473 uint16_t props; 474 475 /* 476 * Hardcode the case closure of i and its relatives and ignore the 477 * data file data for these characters. 478 * The Turkic dotless i and dotted I with their case mapping conditions 479 * and case folding option make the related characters behave specially. 480 * This code matches their closure behavior to their case folding behavior. 
481 */ 482 483 switch(c) { 484 case 0x49: 485 /* regular i and I are in one equivalence class */ 486 sa->add(sa->set, 0x69); 487 return; 488 case 0x69: 489 sa->add(sa->set, 0x49); 490 return; 491 case 0x130: 492 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 493 sa->addString(sa->set, iDot, 2); 494 return; 495 case 0x131: 496 /* dotless i is in a class by itself */ 497 return; 498 default: 499 /* otherwise use the data file data */ 500 break; 501 } 502 503 props=UTRIE2_GET16(&csp->trie, c); 504 if(!PROPS_HAS_EXCEPTION(props)) { 505 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { 506 /* add the one simple case mapping, no matter what type it is */ 507 int32_t delta=UCASE_GET_DELTA(props); 508 if(delta!=0) { 509 sa->add(sa->set, c+delta); 510 } 511 } 512 } else { 513 /* 514 * c has exceptions, so there may be multiple simple and/or 515 * full case mappings. Add them all. 516 */ 517 const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props); 518 const UChar *closure; 519 uint16_t excWord=*pe++; 520 int32_t idx, closureLength, fullLength, length; 521 522 pe0=pe; 523 524 /* add all simple case mappings */ 525 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { 526 if(HAS_SLOT(excWord, idx)) { 527 pe=pe0; 528 GET_SLOT_VALUE(excWord, idx, pe, c); 529 sa->add(sa->set, c); 530 } 531 } 532 533 /* get the closure string pointer & length */ 534 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { 535 pe=pe0; 536 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); 537 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 538 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */ 539 } else { 540 closureLength=0; 541 closure=NULL; 542 } 543 544 /* add the full case folding */ 545 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 546 pe=pe0; 547 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); 548 549 /* start of full case mapping strings */ 550 ++pe; 551 552 fullLength&=0xffff; 
/* bits 16 and higher are reserved */ 553 554 /* skip the lowercase result string */ 555 pe+=fullLength&UCASE_FULL_LOWER; 556 fullLength>>=4; 557 558 /* add the full case folding string */ 559 length=fullLength&0xf; 560 if(length!=0) { 561 sa->addString(sa->set, (const UChar *)pe, length); 562 pe+=length; 563 } 564 565 /* skip the uppercase and titlecase strings */ 566 fullLength>>=4; 567 pe+=fullLength&0xf; 568 fullLength>>=4; 569 pe+=fullLength; 570 571 closure=(const UChar *)pe; /* behind full case mappings */ 572 } 573 574 /* add each code point in the closure string */ 575 for(idx=0; idx<closureLength;) { 576 U16_NEXT_UNSAFE(closure, idx, c); 577 sa->add(sa->set, c); 578 } 579 } 580 } 581 582 /* 583 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated 584 * must be length>0 and max>0 and length<=max 585 */ 586 static U_INLINE int32_t 587 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { 588 int32_t c1, c2; 589 590 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 591 do { 592 c1=*s++; 593 c2=*t++; 594 if(c2==0) { 595 return 1; /* reached the end of t but not of s */ 596 } 597 c1-=c2; 598 if(c1!=0) { 599 return c1; /* return difference result */ 600 } 601 } while(--length>0); 602 /* ends with length==0 */ 603 604 if(max==0 || *t==0) { 605 return 0; /* equal to length of both strings */ 606 } else { 607 return -max; /* return lengh difference */ 608 } 609 } 610 611 U_CFUNC UBool U_EXPORT2 612 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) { 613 const UChar *unfold, *p; 614 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth; 615 616 if(csp->unfold==NULL || s==NULL) { 617 return FALSE; /* no reverse case folding data, or no string */ 618 } 619 if(length<=1) { 620 /* the string is too short to find any match */ 621 /* 622 * more precise would be: 623 * if(!u_strHasMoreChar32Than(s, length, 
1)) 624 * but this does not make much practical difference because 625 * a single supplementary code point would just not be found 626 */ 627 return FALSE; 628 } 629 630 unfold=csp->unfold; 631 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; 632 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; 633 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; 634 unfold+=unfoldRowWidth; 635 636 if(length>unfoldStringWidth) { 637 /* the string is too long to find any match */ 638 return FALSE; 639 } 640 641 /* do a binary search for the string */ 642 start=0; 643 limit=unfoldRows; 644 while(start<limit) { 645 i=(start+limit)/2; 646 p=unfold+(i*unfoldRowWidth); 647 result=strcmpMax(s, length, p, unfoldStringWidth); 648 649 if(result==0) { 650 /* found the string: add each code point, and its case closure */ 651 UChar32 c; 652 653 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { 654 U16_NEXT_UNSAFE(p, i, c); 655 sa->add(sa->set, c); 656 ucase_addCaseClosure(csp, c, sa); 657 } 658 return TRUE; 659 } else if(result<0) { 660 limit=i; 661 } else /* result>0 */ { 662 start=i+1; 663 } 664 } 665 666 return FALSE; /* string not found */ 667 } 668 669 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 670 U_CAPI int32_t U_EXPORT2 671 ucase_getType(const UCaseProps *csp, UChar32 c) { 672 uint16_t props=UTRIE2_GET16(&csp->trie, c); 673 return UCASE_GET_TYPE(props); 674 } 675 676 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ 677 U_CAPI int32_t U_EXPORT2 678 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) { 679 uint16_t props=UTRIE2_GET16(&csp->trie, c); 680 int32_t type=UCASE_GET_TYPE(props); 681 if(props&UCASE_EXCEPTION) { 682 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 683 if(*pe&UCASE_EXC_CASE_IGNORABLE) { 684 type|=4; 685 } 686 } else if(type==UCASE_NONE && (props&UCASE_CASE_IGNORABLE)) { 687 type|=4; 688 } 689 return type; 690 } 691 692 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ 693 static 
U_INLINE int32_t 694 getDotType(const UCaseProps *csp, UChar32 c) { 695 uint16_t props=UTRIE2_GET16(&csp->trie, c); 696 if(!PROPS_HAS_EXCEPTION(props)) { 697 return props&UCASE_DOT_MASK; 698 } else { 699 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 700 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; 701 } 702 } 703 704 U_CAPI UBool U_EXPORT2 705 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) { 706 return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED); 707 } 708 709 U_CAPI UBool U_EXPORT2 710 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) { 711 uint16_t props=UTRIE2_GET16(&csp->trie, c); 712 return (UBool)((props&UCASE_SENSITIVE)!=0); 713 } 714 715 /* string casing ------------------------------------------------------------ */ 716 717 /* 718 * These internal functions form the core of string case mappings. 719 * They map single code points to result code points or strings and take 720 * all necessary conditions (context, locale ID, options) into account. 721 * 722 * They do not iterate over the source or write to the destination 723 * so that the same functions are useful for non-standard string storage, 724 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 725 * For the same reason, the "surrounding text" context is passed in as a 726 * UCaseContextIterator which does not make any assumptions about 727 * the underlying storage. 728 * 729 * This section contains helper functions that check for conditions 730 * in the input text surrounding the current code point 731 * according to SpecialCasing.txt. 
732 * 733 * Each helper function gets the index 734 * - after the current code point if it looks at following text 735 * - before the current code point if it looks at preceding text 736 * 737 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: 738 * 739 * Final_Sigma 740 * C is preceded by a sequence consisting of 741 * a cased letter and a case-ignorable sequence, 742 * and C is not followed by a sequence consisting of 743 * an ignorable sequence and then a cased letter. 744 * 745 * More_Above 746 * C is followed by one or more characters of combining class 230 (ABOVE) 747 * in the combining character sequence. 748 * 749 * After_Soft_Dotted 750 * The last preceding character with combining class of zero before C 751 * was Soft_Dotted, 752 * and there is no intervening combining character class 230 (ABOVE). 753 * 754 * Before_Dot 755 * C is followed by combining dot above (U+0307). 756 * Any sequence of characters with a combining class that is neither 0 nor 230 757 * may intervene between the current character and the combining dot above. 758 * 759 * The erratum from 2002-10-31 adds the condition 760 * 761 * After_I 762 * The last preceding base character was an uppercase I, and there is no 763 * intervening combining character class 230 (ABOVE). 764 * 765 * (See Jitterbug 2344 and the comments on After_I below.) 766 * 767 * Helper definitions in Unicode 3.2 UAX 21: 768 * 769 * D1. A character C is defined to be cased 770 * if it meets any of the following criteria: 771 * 772 * - The general category of C is Titlecase Letter (Lt) 773 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase 774 * - Given D = NFD(C), then it is not the case that: 775 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) 776 * (This third criterium does not add any characters to the list 777 * for Unicode 3.2. Ignored.) 778 * 779 * D2. 
A character C is defined to be case-ignorable 780 * if it meets either of the following criteria: 781 * 782 * - The general category of C is 783 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 784 * Letter Modifier (Lm), or Symbol Modifier (Sk) 785 * - C is one of the following characters 786 * U+0027 APOSTROPHE 787 * U+00AD SOFT HYPHEN (SHY) 788 * U+2019 RIGHT SINGLE QUOTATION MARK 789 * (the preferred character for apostrophe) 790 * 791 * D3. A case-ignorable sequence is a sequence of 792 * zero or more case-ignorable characters. 793 */ 794 795 #define is_a(c) ((c)=='a' || (c)=='A') 796 #define is_d(c) ((c)=='d' || (c)=='D') 797 #define is_e(c) ((c)=='e' || (c)=='E') 798 #define is_i(c) ((c)=='i' || (c)=='I') 799 #define is_l(c) ((c)=='l' || (c)=='L') 800 #define is_n(c) ((c)=='n' || (c)=='N') 801 #define is_r(c) ((c)=='r' || (c)=='R') 802 #define is_t(c) ((c)=='t' || (c)=='T') 803 #define is_u(c) ((c)=='u' || (c)=='U') 804 #define is_z(c) ((c)=='z' || (c)=='Z') 805 806 /* separator? */ 807 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0) 808 809 /** 810 * Requires non-NULL locale ID but otherwise does the equivalent of 811 * checking for language codes as if uloc_getLanguage() were called: 812 * Accepts both 2- and 3-letter codes and accepts case variants. 813 */ 814 U_CFUNC int32_t 815 ucase_getCaseLocale(const char *locale, int32_t *locCache) { 816 int32_t result; 817 char c; 818 819 if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) { 820 return result; 821 } 822 823 result=UCASE_LOC_ROOT; 824 825 /* 826 * This function used to use uloc_getLanguage(), but the current code 827 * removes the dependency of this low-level code on uloc implementation code 828 * and is faster because not the whole locale ID has to be 829 * examined and copied/transformed. 830 * 831 * Because this code does not want to depend on uloc, the caller must 832 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). 
833 */ 834 c=*locale++; 835 if(is_t(c)) { 836 /* tr or tur? */ 837 c=*locale++; 838 if(is_u(c)) { 839 c=*locale++; 840 } 841 if(is_r(c)) { 842 c=*locale; 843 if(is_sep(c)) { 844 result=UCASE_LOC_TURKISH; 845 } 846 } 847 } else if(is_a(c)) { 848 /* az or aze? */ 849 c=*locale++; 850 if(is_z(c)) { 851 c=*locale++; 852 if(is_e(c)) { 853 c=*locale; 854 } 855 if(is_sep(c)) { 856 result=UCASE_LOC_TURKISH; 857 } 858 } 859 } else if(is_l(c)) { 860 /* lt or lit? */ 861 c=*locale++; 862 if(is_i(c)) { 863 c=*locale++; 864 } 865 if(is_t(c)) { 866 c=*locale; 867 if(is_sep(c)) { 868 result=UCASE_LOC_LITHUANIAN; 869 } 870 } 871 } else if(is_n(c)) { 872 /* nl or nld? */ 873 c=*locale++; 874 if(is_l(c)) { 875 c=*locale++; 876 if(is_d(c)) { 877 c=*locale; 878 } 879 if(is_sep(c)) { 880 result=UCASE_LOC_DUTCH; 881 } 882 } 883 } 884 885 if(locCache!=NULL) { 886 *locCache=result; 887 } 888 return result; 889 } 890 891 /* 892 * Is followed by 893 * {case-ignorable}* cased 894 * ? 895 * (dir determines looking forward/backward) 896 * If a character is case-ignorable, it is skipped regardless of whether 897 * it is also cased or not. 898 */ 899 static UBool 900 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) { 901 UChar32 c; 902 903 if(iter==NULL) { 904 return FALSE; 905 } 906 907 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { 908 int32_t type=ucase_getTypeOrIgnorable(csp, c); 909 if(type&4) { 910 /* case-ignorable, continue with the loop */ 911 } else if(type!=UCASE_NONE) { 912 return TRUE; /* followed by cased letter */ 913 } else { 914 return FALSE; /* uncased and not case-ignorable */ 915 } 916 } 917 918 return FALSE; /* not followed by cased letter */ 919 } 920 921 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? 
*/
/*
 * Walks backward over the preceding text via iter (first call with dir=-1,
 * then dir=0) and skips characters whose dot type is UCASE_OTHER_ACCENT.
 * Returns FALSE if iter is NULL or if the text ends first.
 */
static UBool
isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    UChar32 c;
    int8_t dir=-1;

    if(iter==NULL) {
        return FALSE;
    }

    while((c=iter(context, dir))>=0) {
        int32_t dotType=getDotType(csp, c);
        if(dotType==UCASE_SOFT_DOTTED) {
            return TRUE; /* preceded by TYPE_i */
        }
        if(dotType!=UCASE_OTHER_ACCENT) {
            return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
        }
        dir=0;
    }

    return FALSE; /* not preceded by TYPE_i */
}

/*
 * See Jitterbug 2344:
 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
 * we made those releases compatible with Unicode 3.2 which had not fixed
 * a related bug in SpecialCasing.txt.
 *
 * From the Jitterbug 2344 text:
 * ... this bug is listed as a Unicode erratum
 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
 * <quote>
 * There are two errors in SpecialCasing.txt.
 * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
 * 2. An incorrect context definition. Correct as follows:
 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
 * ---
 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
 * where the context After_I is defined as:
 * The last preceding base character was an uppercase I, and there is no
 * intervening combining character class 230 (ABOVE).
 * </quote>
 *
 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
 *
 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
971 * # This matches the behavior of the canonically equivalent I-dot_above 972 * 973 * See also the description in this place in older versions of uchar.c (revision 1.100). 974 * 975 * Markus W. Scherer 2003-feb-15 976 */ 977 978 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 979 static UBool 980 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 981 UChar32 c; 982 int32_t dotType; 983 int8_t dir; 984 985 if(iter==NULL) { 986 return FALSE; 987 } 988 989 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 990 if(c==0x49) { 991 return TRUE; /* preceded by I */ 992 } 993 dotType=getDotType(csp, c); 994 if(dotType!=UCASE_OTHER_ACCENT) { 995 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ 996 } 997 } 998 999 return FALSE; /* not preceded by I */ 1000 } 1001 1002 /* Is followed by one or more cc==230 ? */ 1003 static UBool 1004 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 1005 UChar32 c; 1006 int32_t dotType; 1007 int8_t dir; 1008 1009 if(iter==NULL) { 1010 return FALSE; 1011 } 1012 1013 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 1014 dotType=getDotType(csp, c); 1015 if(dotType==UCASE_ABOVE) { 1016 return TRUE; /* at least one cc==230 following */ 1017 } else if(dotType!=UCASE_OTHER_ACCENT) { 1018 return FALSE; /* next base character, no more cc==230 following */ 1019 } 1020 } 1021 1022 return FALSE; /* no more cc==230 following */ 1023 } 1024 1025 /* Is followed by a dot above (without cc==230 in between) ? 
*/ 1026 static UBool 1027 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) { 1028 UChar32 c; 1029 int32_t dotType; 1030 int8_t dir; 1031 1032 if(iter==NULL) { 1033 return FALSE; 1034 } 1035 1036 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 1037 if(c==0x307) { 1038 return TRUE; 1039 } 1040 dotType=getDotType(csp, c); 1041 if(dotType!=UCASE_OTHER_ACCENT) { 1042 return FALSE; /* next base character or cc==230 in between */ 1043 } 1044 } 1045 1046 return FALSE; /* no dot above following */ 1047 } 1048 1049 U_CAPI int32_t U_EXPORT2 1050 ucase_toFullLower(const UCaseProps *csp, UChar32 c, 1051 UCaseContextIterator *iter, void *context, 1052 const UChar **pString, 1053 const char *locale, int32_t *locCache) 1054 { 1055 UChar32 result=c; 1056 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1057 if(!PROPS_HAS_EXCEPTION(props)) { 1058 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1059 result=c+UCASE_GET_DELTA(props); 1060 } 1061 } else { 1062 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 1063 uint16_t excWord=*pe++; 1064 int32_t full; 1065 1066 pe2=pe; 1067 1068 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 1069 /* use hardcoded conditions and mappings */ 1070 int32_t loc=ucase_getCaseLocale(locale, locCache); 1071 1072 /* 1073 * Test for conditional mappings first 1074 * (otherwise the unconditional default mappings are always taken), 1075 * then test for characters that have unconditional mappings in SpecialCasing.txt, 1076 * then get the UnicodeData.txt mappings. 1077 */ 1078 if( loc==UCASE_LOC_LITHUANIAN && 1079 /* base characters, find accents above */ 1080 (((c==0x49 || c==0x4a || c==0x12e) && 1081 isFollowedByMoreAbove(csp, iter, context)) || 1082 /* precomposed with accent above, no need to find one */ 1083 (c==0xcc || c==0xcd || c==0x128)) 1084 ) { 1085 /* 1086 # Lithuanian 1087 1088 # Lithuanian retains the dot in a lowercase i when followed by accents. 
1089 1090 # Introduce an explicit dot above when lowercasing capital I's and J's 1091 # whenever there are more accents above. 1092 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 1093 1094 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 1095 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 1096 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 1097 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 1098 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 1099 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 1100 */ 1101 switch(c) { 1102 case 0x49: /* LATIN CAPITAL LETTER I */ 1103 *pString=iDot; 1104 return 2; 1105 case 0x4a: /* LATIN CAPITAL LETTER J */ 1106 *pString=jDot; 1107 return 2; 1108 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 1109 *pString=iOgonekDot; 1110 return 2; 1111 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 1112 *pString=iDotGrave; 1113 return 3; 1114 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 1115 *pString=iDotAcute; 1116 return 3; 1117 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 1118 *pString=iDotTilde; 1119 return 3; 1120 default: 1121 return 0; /* will not occur */ 1122 } 1123 /* # Turkish and Azeri */ 1124 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { 1125 /* 1126 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 1127 # The following rules handle those cases. 1128 1129 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 1130 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 1131 */ 1132 return 0x69; 1133 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) { 1134 /* 1135 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
1136 # This matches the behavior of the canonically equivalent I-dot_above 1137 1138 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 1139 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 1140 */ 1141 return 0; /* remove the dot (continue without output) */ 1142 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) { 1143 /* 1144 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 1145 1146 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 1147 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 1148 */ 1149 return 0x131; 1150 } else if(c==0x130) { 1151 /* 1152 # Preserve canonical equivalence for I with dot. Turkic is handled below. 1153 1154 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1155 */ 1156 *pString=iDot; 1157 return 2; 1158 } else if( c==0x3a3 && 1159 !isFollowedByCasedLetter(csp, iter, context, 1) && 1160 isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */ 1161 ) { 1162 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 1163 /* 1164 # Special case for final form of sigma 1165 1166 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 1167 */ 1168 return 0x3c2; /* greek small final sigma */ 1169 } else { 1170 /* no known conditional special case mapping, use a normal mapping */ 1171 } 1172 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1173 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1174 full&=UCASE_FULL_LOWER; 1175 if(full!=0) { 1176 /* set the output pointer to the lowercase mapping */ 1177 *pString=pe+1; 1178 1179 /* return the string length */ 1180 return full; 1181 } 1182 } 1183 1184 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1185 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); 1186 } 1187 } 1188 1189 return (result==c) ? 
~result : result; 1190 } 1191 1192 /* internal */ 1193 static int32_t 1194 toUpperOrTitle(const UCaseProps *csp, UChar32 c, 1195 UCaseContextIterator *iter, void *context, 1196 const UChar **pString, 1197 const char *locale, int32_t *locCache, 1198 UBool upperNotTitle) { 1199 UChar32 result=c; 1200 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1201 if(!PROPS_HAS_EXCEPTION(props)) { 1202 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 1203 result=c+UCASE_GET_DELTA(props); 1204 } 1205 } else { 1206 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 1207 uint16_t excWord=*pe++; 1208 int32_t full, idx; 1209 1210 pe2=pe; 1211 1212 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 1213 /* use hardcoded conditions and mappings */ 1214 int32_t loc=ucase_getCaseLocale(locale, locCache); 1215 1216 if(loc==UCASE_LOC_TURKISH && c==0x69) { 1217 /* 1218 # Turkish and Azeri 1219 1220 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 1221 # The following rules handle those cases. 1222 1223 # When uppercasing, i turns into a dotted capital I 1224 1225 0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I 1226 0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I 1227 */ 1228 return 0x130; 1229 } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) { 1230 /* 1231 # Lithuanian 1232 1233 # Lithuanian retains the dot in a lowercase i when followed by accents. 
1234 1235 # Remove DOT ABOVE after "i" with upper or titlecase 1236 1237 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE 1238 */ 1239 return 0; /* remove the dot (continue without output) */ 1240 } else { 1241 /* no known conditional special case mapping, use a normal mapping */ 1242 } 1243 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1244 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1245 1246 /* start of full case mapping strings */ 1247 ++pe; 1248 1249 /* skip the lowercase and case-folding result strings */ 1250 pe+=full&UCASE_FULL_LOWER; 1251 full>>=4; 1252 pe+=full&0xf; 1253 full>>=4; 1254 1255 if(upperNotTitle) { 1256 full&=0xf; 1257 } else { 1258 /* skip the uppercase result string */ 1259 pe+=full&0xf; 1260 full=(full>>4)&0xf; 1261 } 1262 1263 if(full!=0) { 1264 /* set the output pointer to the result string */ 1265 *pString=pe; 1266 1267 /* return the string length */ 1268 return full; 1269 } 1270 } 1271 1272 if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 1273 idx=UCASE_EXC_TITLE; 1274 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 1275 /* here, titlecase is same as uppercase */ 1276 idx=UCASE_EXC_UPPER; 1277 } else { 1278 return ~c; 1279 } 1280 GET_SLOT_VALUE(excWord, idx, pe2, result); 1281 } 1282 1283 return (result==c) ? 
~result : result; 1284 } 1285 1286 U_CAPI int32_t U_EXPORT2 1287 ucase_toFullUpper(const UCaseProps *csp, UChar32 c, 1288 UCaseContextIterator *iter, void *context, 1289 const UChar **pString, 1290 const char *locale, int32_t *locCache) { 1291 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE); 1292 } 1293 1294 U_CAPI int32_t U_EXPORT2 1295 ucase_toFullTitle(const UCaseProps *csp, UChar32 c, 1296 UCaseContextIterator *iter, void *context, 1297 const UChar **pString, 1298 const char *locale, int32_t *locCache) { 1299 return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE); 1300 } 1301 1302 /* case folding ------------------------------------------------------------- */ 1303 1304 /* 1305 * Case folding is similar to lowercasing. 1306 * The result may be a simple mapping, i.e., a single code point, or 1307 * a full mapping, i.e., a string. 1308 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping, 1309 * then only the lowercase mapping is stored. 1310 * 1311 * Some special cases are hardcoded because their conditions cannot be 1312 * parsed and processed from CaseFolding.txt. 1313 * 1314 * Unicode 3.2 CaseFolding.txt specifies for its status field: 1315 1316 # C: common case folding, common mappings shared by both simple and full mappings. 1317 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces. 1318 # S: simple case folding, mappings to single characters where different from F. 1319 # T: special case for uppercase I and dotted uppercase I 1320 # - For non-Turkic languages, this mapping is normally not used. 1321 # - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters. 1322 # 1323 # Usage: 1324 # A. To do a simple case folding, use the mappings with status C + S. 1325 # B. To do a full case folding, use the mappings with status C + F. 
1326 # 1327 # The mappings with status T can be used or omitted depending on the desired case-folding 1328 # behavior. (The default option is to exclude them.) 1329 1330 * Unicode 3.2 has 'T' mappings as follows: 1331 1332 0049; T; 0131; # LATIN CAPITAL LETTER I 1333 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1334 1335 * while the default mappings for these code points are: 1336 1337 0049; C; 0069; # LATIN CAPITAL LETTER I 1338 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1339 1340 * U+0130 has no simple case folding (simple-case-folds to itself). 1341 */ 1342 1343 /* return the simple case folding mapping for c */ 1344 U_CAPI UChar32 U_EXPORT2 1345 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) { 1346 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1347 if(!PROPS_HAS_EXCEPTION(props)) { 1348 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1349 c+=UCASE_GET_DELTA(props); 1350 } 1351 } else { 1352 const uint16_t *pe=GET_EXCEPTIONS(csp, props); 1353 uint16_t excWord=*pe++; 1354 int32_t idx; 1355 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1356 /* special case folding mappings, hardcoded */ 1357 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1358 /* default mappings */ 1359 if(c==0x49) { 1360 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1361 return 0x69; 1362 } else if(c==0x130) { 1363 /* no simple case folding for U+0130 */ 1364 return c; 1365 } 1366 } else { 1367 /* Turkic mappings */ 1368 if(c==0x49) { 1369 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1370 return 0x131; 1371 } else if(c==0x130) { 1372 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1373 return 0x69; 1374 } 1375 } 1376 } 1377 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1378 idx=UCASE_EXC_FOLD; 1379 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1380 idx=UCASE_EXC_LOWER; 1381 } else { 1382 return c; 1383 } 1384 GET_SLOT_VALUE(excWord, idx, pe, c); 1385 } 1386 return c; 1387 } 1388 1389 /* 1390 * Issue for canonical caseless match (UAX #21): 1391 
* Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1392 * canonical equivalence, unlike default-option casefolding. 1393 * For example, I-grave and I + grave fold to strings that are not canonically 1394 * equivalent. 1395 * For more details, see the comment in unorm_compare() in unorm.cpp 1396 * and the intermediate prototype changes for Jitterbug 2021. 1397 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1398 * 1399 * This did not get fixed because it appears that it is not possible to fix 1400 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1401 * together in a way that they still fold to common result strings. 1402 */ 1403 1404 U_CAPI int32_t U_EXPORT2 1405 ucase_toFullFolding(const UCaseProps *csp, UChar32 c, 1406 const UChar **pString, 1407 uint32_t options) 1408 { 1409 UChar32 result=c; 1410 uint16_t props=UTRIE2_GET16(&csp->trie, c); 1411 if(!PROPS_HAS_EXCEPTION(props)) { 1412 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1413 result=c+UCASE_GET_DELTA(props); 1414 } 1415 } else { 1416 const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2; 1417 uint16_t excWord=*pe++; 1418 int32_t full, idx; 1419 1420 pe2=pe; 1421 1422 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1423 /* use hardcoded conditions and mappings */ 1424 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1425 /* default mappings */ 1426 if(c==0x49) { 1427 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1428 return 0x69; 1429 } else if(c==0x130) { 1430 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1431 *pString=iDot; 1432 return 2; 1433 } 1434 } else { 1435 /* Turkic mappings */ 1436 if(c==0x49) { 1437 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1438 return 0x131; 1439 } else if(c==0x130) { 1440 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1441 return 0x69; 1442 } 1443 } 1444 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1445 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 
1446 1447 /* start of full case mapping strings */ 1448 ++pe; 1449 1450 /* skip the lowercase result string */ 1451 pe+=full&UCASE_FULL_LOWER; 1452 full=(full>>4)&0xf; 1453 1454 if(full!=0) { 1455 /* set the output pointer to the result string */ 1456 *pString=pe; 1457 1458 /* return the string length */ 1459 return full; 1460 } 1461 } 1462 1463 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1464 idx=UCASE_EXC_FOLD; 1465 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1466 idx=UCASE_EXC_LOWER; 1467 } else { 1468 return ~c; 1469 } 1470 GET_SLOT_VALUE(excWord, idx, pe2, result); 1471 } 1472 1473 return (result==c) ? ~result : result; 1474 } 1475 1476 /* case mapping properties API ---------------------------------------------- */ 1477 1478 /* get the UCaseProps singleton, or else its dummy, once and for all */ 1479 #if !UCASE_HARDCODE_DATA 1480 static const UCaseProps * 1481 getCaseProps() { 1482 /* 1483 * This lazy intialization with double-checked locking (without mutex protection for 1484 * the initial check) is transiently unsafe under certain circumstances. 1485 * Check the readme and use u_init() if necessary. 1486 */ 1487 1488 /* the initial check is performed by the GET_CASE_PROPS() macro */ 1489 const UCaseProps *csp; 1490 UErrorCode errorCode=U_ZERO_ERROR; 1491 1492 csp=ucase_getSingleton(&errorCode); 1493 if(U_FAILURE(errorCode)) { 1494 errorCode=U_ZERO_ERROR; 1495 csp=ucase_getDummy(&errorCode); 1496 if(U_FAILURE(errorCode)) { 1497 return NULL; 1498 } 1499 } 1500 1501 return csp; 1502 } 1503 #endif 1504 1505 /* 1506 * In ICU 3.0, most Unicode properties were loaded from uprops.icu. 1507 * ICU 3.2 adds ucase.icu for case mapping properties. 1508 * ICU 3.4 adds ubidi.icu for bidi/shaping properties and 1509 * removes case/bidi/shaping properties from uprops.icu. 1510 * 1511 * Loading of uprops.icu was never mutex-protected and required u_init() 1512 * for thread safety. 
1513 * In order to maintain performance for all such properties, 1514 * ucase.icu and ubidi.icu are loaded lazily, without mutexing. 1515 * u_init() will try to load them for thread safety, 1516 * but u_init() will not fail if they are missing. 1517 * 1518 * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load) 1519 * and an error code for load failure. 1520 * Instead, here we try to load at most once. 1521 * If it works, we use the resulting singleton object. 1522 * If it fails, then we get a dummy object, which always works unless 1523 * we are seriously out of memory. 1524 * After the first try, we have a never-changing pointer to either the 1525 * real singleton or the dummy. 1526 * 1527 * This method is used in Unicode properties APIs (uchar.h) that 1528 * do not have a service object and also do not have an error code parameter. 1529 * Other API implementations get the singleton themselves 1530 * (with mutexing), store it in the service object, and report errors. 1531 * 1532 * TODO: Remove this support for non-hardcoded data. u_init() is publicly 1533 * advertised as not being required for thread safety, we cannot 1534 * revert to unsafe data loading. 1535 */ 1536 #if !UCASE_HARDCODE_DATA 1537 #define GET_CASE_PROPS() (gCsp!=NULL ? 
gCsp : getCaseProps()) 1538 #else 1539 #define GET_CASE_PROPS() &ucase_props_singleton 1540 #endif 1541 1542 /* public API (see uchar.h) */ 1543 1544 U_CAPI UBool U_EXPORT2 1545 u_isULowercase(UChar32 c) { 1546 return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c)); 1547 } 1548 1549 U_CAPI UBool U_EXPORT2 1550 u_isUUppercase(UChar32 c) { 1551 return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c)); 1552 } 1553 1554 /* Transforms the Unicode character to its lower case equivalent.*/ 1555 U_CAPI UChar32 U_EXPORT2 1556 u_tolower(UChar32 c) { 1557 return ucase_tolower(GET_CASE_PROPS(), c); 1558 } 1559 1560 /* Transforms the Unicode character to its upper case equivalent.*/ 1561 U_CAPI UChar32 U_EXPORT2 1562 u_toupper(UChar32 c) { 1563 return ucase_toupper(GET_CASE_PROPS(), c); 1564 } 1565 1566 /* Transforms the Unicode character to its title case equivalent.*/ 1567 U_CAPI UChar32 U_EXPORT2 1568 u_totitle(UChar32 c) { 1569 return ucase_totitle(GET_CASE_PROPS(), c); 1570 } 1571 1572 /* return the simple case folding mapping for c */ 1573 U_CAPI UChar32 U_EXPORT2 1574 u_foldCase(UChar32 c, uint32_t options) { 1575 return ucase_fold(GET_CASE_PROPS(), c, options); 1576 } 1577 1578 U_CFUNC int32_t U_EXPORT2 1579 ucase_hasBinaryProperty(UChar32 c, UProperty which) { 1580 /* case mapping properties */ 1581 const UChar *resultString; 1582 int32_t locCache; 1583 const UCaseProps *csp=GET_CASE_PROPS(); 1584 if(csp==NULL) { 1585 return FALSE; 1586 } 1587 switch(which) { 1588 case UCHAR_LOWERCASE: 1589 return (UBool)(UCASE_LOWER==ucase_getType(csp, c)); 1590 case UCHAR_UPPERCASE: 1591 return (UBool)(UCASE_UPPER==ucase_getType(csp, c)); 1592 case UCHAR_SOFT_DOTTED: 1593 return ucase_isSoftDotted(csp, c); 1594 case UCHAR_CASE_SENSITIVE: 1595 return ucase_isCaseSensitive(csp, c); 1596 case UCHAR_CASED: 1597 return (UBool)(UCASE_NONE!=ucase_getType(csp, c)); 1598 case UCHAR_CASE_IGNORABLE: 1599 return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2); 1600 /* 1601 * Note: 
The following Changes_When_Xyz are defined as testing whether 1602 * the NFD form of the input changes when Xyz-case-mapped. 1603 * However, this simpler implementation of these properties, 1604 * ignoring NFD, passes the tests. 1605 * The implementation needs to be changed if the tests start failing. 1606 * When that happens, optimizations should be used to work with the 1607 * per-single-code point ucase_toFullXyz() functions unless 1608 * the NFD form has more than one code point, 1609 * and the property starts set needs to be the union of the 1610 * start sets for normalization and case mappings. 1611 */ 1612 case UCHAR_CHANGES_WHEN_LOWERCASED: 1613 locCache=UCASE_LOC_ROOT; 1614 return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1615 case UCHAR_CHANGES_WHEN_UPPERCASED: 1616 locCache=UCASE_LOC_ROOT; 1617 return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1618 case UCHAR_CHANGES_WHEN_TITLECASED: 1619 locCache=UCASE_LOC_ROOT; 1620 return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1621 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ 1622 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1623 locCache=UCASE_LOC_ROOT; 1624 return (UBool)( 1625 ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1626 ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 || 1627 ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0); 1628 default: 1629 return FALSE; 1630 } 1631 } 1632