1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2004-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucase.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2004aug30 16 * created by: Markus W. Scherer 17 * 18 * Low-level Unicode character/string case mapping code. 19 * Much code moved here (and modified) from uchar.c. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "unicode/unistr.h" 24 #include "unicode/uset.h" 25 #include "unicode/udata.h" /* UDataInfo */ 26 #include "unicode/utf16.h" 27 #include "ucmndata.h" /* DataHeader */ 28 #include "udatamem.h" 29 #include "umutex.h" 30 #include "uassert.h" 31 #include "cmemory.h" 32 #include "utrie2.h" 33 #include "ucase.h" 34 35 struct UCaseProps { 36 UDataMemory *mem; 37 const int32_t *indexes; 38 const uint16_t *exceptions; 39 const uint16_t *unfold; 40 41 UTrie2 trie; 42 uint8_t formatVersion[4]; 43 }; 44 45 /* ucase_props_data.h is machine-generated by gencase --csource */ 46 #define INCLUDED_FROM_UCASE_CPP 47 #include "ucase_props_data.h" 48 49 /* set of property starts for UnicodeSet ------------------------------------ */ 50 51 static UBool U_CALLCONV 52 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) { 53 /* add the start code point to the USet */ 54 const USetAdder *sa=(const USetAdder *)context; 55 sa->add(sa->set, start); 56 return TRUE; 57 } 58 59 U_CFUNC void U_EXPORT2 60 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) { 61 if(U_FAILURE(*pErrorCode)) { 62 return; 63 } 64 65 /* add the start code point of each same-value range of the trie */ 66 
utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa); 67 68 /* add code points with hardcoded properties, plus the ones following them */ 69 70 /* (none right now, see comment below) */ 71 72 /* 73 * Omit code points with hardcoded specialcasing properties 74 * because we do not build property UnicodeSets for them right now. 75 */ 76 } 77 78 /* data access primitives --------------------------------------------------- */ 79 80 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT)) 81 82 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION) 83 84 /* number of bits in an 8-bit integer value */ 85 static const uint8_t flagsOffset[256]={ 86 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 87 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 88 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 89 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 90 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 91 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 92 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 93 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 94 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 95 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 96 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 97 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 98 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 99 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 100 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 101 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 102 }; 103 104 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx))) 105 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)] 106 107 /* 108 * Get the value of an optional-value slot where HAS_SLOT(excWord, idx). 
109 * 110 * @param excWord (in) initial exceptions word 111 * @param idx (in) desired slot index 112 * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++; 113 * moved to the last uint16_t of the value, use +1 for beginning of next slot 114 * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified 115 */ 116 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \ 117 if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \ 118 (pExc16)+=SLOT_OFFSET(excWord, idx); \ 119 (value)=*pExc16; \ 120 } else { \ 121 (pExc16)+=2*SLOT_OFFSET(excWord, idx); \ 122 (value)=*pExc16++; \ 123 (value)=((value)<<16)|*pExc16; \ 124 } 125 126 /* simple case mappings ----------------------------------------------------- */ 127 128 U_CAPI UChar32 U_EXPORT2 129 ucase_tolower(UChar32 c) { 130 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 131 if(!PROPS_HAS_EXCEPTION(props)) { 132 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 133 c+=UCASE_GET_DELTA(props); 134 } 135 } else { 136 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 137 uint16_t excWord=*pe++; 138 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 139 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c); 140 } 141 } 142 return c; 143 } 144 145 U_CAPI UChar32 U_EXPORT2 146 ucase_toupper(UChar32 c) { 147 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 148 if(!PROPS_HAS_EXCEPTION(props)) { 149 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 150 c+=UCASE_GET_DELTA(props); 151 } 152 } else { 153 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 154 uint16_t excWord=*pe++; 155 if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 156 GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c); 157 } 158 } 159 return c; 160 } 161 162 U_CAPI UChar32 U_EXPORT2 163 ucase_totitle(UChar32 c) { 164 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 165 if(!PROPS_HAS_EXCEPTION(props)) { 166 if(UCASE_GET_TYPE(props)==UCASE_LOWER) { 167 c+=UCASE_GET_DELTA(props); 168 } 169 } else { 170 const uint16_t 
*pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 171 uint16_t excWord=*pe++; 172 int32_t idx; 173 if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) { 174 idx=UCASE_EXC_TITLE; 175 } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) { 176 idx=UCASE_EXC_UPPER; 177 } else { 178 return c; 179 } 180 GET_SLOT_VALUE(excWord, idx, pe, c); 181 } 182 return c; 183 } 184 185 static const UChar iDot[2] = { 0x69, 0x307 }; 186 static const UChar jDot[2] = { 0x6a, 0x307 }; 187 static const UChar iOgonekDot[3] = { 0x12f, 0x307 }; 188 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 }; 189 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 }; 190 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 }; 191 192 193 U_CFUNC void U_EXPORT2 194 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) { 195 uint16_t props; 196 197 /* 198 * Hardcode the case closure of i and its relatives and ignore the 199 * data file data for these characters. 200 * The Turkic dotless i and dotted I with their case mapping conditions 201 * and case folding option make the related characters behave specially. 202 * This code matches their closure behavior to their case folding behavior. 
203 */ 204 205 switch(c) { 206 case 0x49: 207 /* regular i and I are in one equivalence class */ 208 sa->add(sa->set, 0x69); 209 return; 210 case 0x69: 211 sa->add(sa->set, 0x49); 212 return; 213 case 0x130: 214 /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */ 215 sa->addString(sa->set, iDot, 2); 216 return; 217 case 0x131: 218 /* dotless i is in a class by itself */ 219 return; 220 default: 221 /* otherwise use the data file data */ 222 break; 223 } 224 225 props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 226 if(!PROPS_HAS_EXCEPTION(props)) { 227 if(UCASE_GET_TYPE(props)!=UCASE_NONE) { 228 /* add the one simple case mapping, no matter what type it is */ 229 int32_t delta=UCASE_GET_DELTA(props); 230 if(delta!=0) { 231 sa->add(sa->set, c+delta); 232 } 233 } 234 } else { 235 /* 236 * c has exceptions, so there may be multiple simple and/or 237 * full case mappings. Add them all. 238 */ 239 const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 240 const UChar *closure; 241 uint16_t excWord=*pe++; 242 int32_t idx, closureLength, fullLength, length; 243 244 pe0=pe; 245 246 /* add all simple case mappings */ 247 for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) { 248 if(HAS_SLOT(excWord, idx)) { 249 pe=pe0; 250 GET_SLOT_VALUE(excWord, idx, pe, c); 251 sa->add(sa->set, c); 252 } 253 } 254 255 /* get the closure string pointer & length */ 256 if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) { 257 pe=pe0; 258 GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength); 259 closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */ 260 closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */ 261 } else { 262 closureLength=0; 263 closure=NULL; 264 } 265 266 /* add the full case folding */ 267 if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 268 pe=pe0; 269 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength); 270 271 /* start of full case mapping strings */ 
272 ++pe; 273 274 fullLength&=0xffff; /* bits 16 and higher are reserved */ 275 276 /* skip the lowercase result string */ 277 pe+=fullLength&UCASE_FULL_LOWER; 278 fullLength>>=4; 279 280 /* add the full case folding string */ 281 length=fullLength&0xf; 282 if(length!=0) { 283 sa->addString(sa->set, (const UChar *)pe, length); 284 pe+=length; 285 } 286 287 /* skip the uppercase and titlecase strings */ 288 fullLength>>=4; 289 pe+=fullLength&0xf; 290 fullLength>>=4; 291 pe+=fullLength; 292 293 closure=(const UChar *)pe; /* behind full case mappings */ 294 } 295 296 /* add each code point in the closure string */ 297 for(idx=0; idx<closureLength;) { 298 U16_NEXT_UNSAFE(closure, idx, c); 299 sa->add(sa->set, c); 300 } 301 } 302 } 303 304 /* 305 * compare s, which has a length, with t, which has a maximum length or is NUL-terminated 306 * must be length>0 and max>0 and length<=max 307 */ 308 static inline int32_t 309 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) { 310 int32_t c1, c2; 311 312 max-=length; /* we require length<=max, so no need to decrement max in the loop */ 313 do { 314 c1=*s++; 315 c2=*t++; 316 if(c2==0) { 317 return 1; /* reached the end of t but not of s */ 318 } 319 c1-=c2; 320 if(c1!=0) { 321 return c1; /* return difference result */ 322 } 323 } while(--length>0); 324 /* ends with length==0 */ 325 326 if(max==0 || *t==0) { 327 return 0; /* equal to length of both strings */ 328 } else { 329 return -max; /* return lengh difference */ 330 } 331 } 332 333 U_CFUNC UBool U_EXPORT2 334 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) { 335 int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth; 336 337 if(ucase_props_singleton.unfold==NULL || s==NULL) { 338 return FALSE; /* no reverse case folding data, or no string */ 339 } 340 if(length<=1) { 341 /* the string is too short to find any match */ 342 /* 343 * more precise would be: 344 * if(!u_strHasMoreChar32Than(s, length, 
1)) 345 * but this does not make much practical difference because 346 * a single supplementary code point would just not be found 347 */ 348 return FALSE; 349 } 350 351 const uint16_t *unfold=ucase_props_singleton.unfold; 352 unfoldRows=unfold[UCASE_UNFOLD_ROWS]; 353 unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH]; 354 unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH]; 355 unfold+=unfoldRowWidth; 356 357 if(length>unfoldStringWidth) { 358 /* the string is too long to find any match */ 359 return FALSE; 360 } 361 362 /* do a binary search for the string */ 363 start=0; 364 limit=unfoldRows; 365 while(start<limit) { 366 i=(start+limit)/2; 367 const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth)); 368 result=strcmpMax(s, length, p, unfoldStringWidth); 369 370 if(result==0) { 371 /* found the string: add each code point, and its case closure */ 372 UChar32 c; 373 374 for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) { 375 U16_NEXT_UNSAFE(p, i, c); 376 sa->add(sa->set, c); 377 ucase_addCaseClosure(c, sa); 378 } 379 return TRUE; 380 } else if(result<0) { 381 limit=i; 382 } else /* result>0 */ { 383 start=i+1; 384 } 385 } 386 387 return FALSE; /* string not found */ 388 } 389 390 U_NAMESPACE_BEGIN 391 392 FullCaseFoldingIterator::FullCaseFoldingIterator() 393 : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)), 394 unfoldRows(unfold[UCASE_UNFOLD_ROWS]), 395 unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]), 396 unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]), 397 currentRow(0), 398 rowCpIndex(unfoldStringWidth) { 399 unfold+=unfoldRowWidth; 400 } 401 402 UChar32 403 FullCaseFoldingIterator::next(UnicodeString &full) { 404 // Advance past the last-delivered code point. 
405 const UChar *p=unfold+(currentRow*unfoldRowWidth); 406 if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) { 407 ++currentRow; 408 p+=unfoldRowWidth; 409 rowCpIndex=unfoldStringWidth; 410 } 411 if(currentRow>=unfoldRows) { return U_SENTINEL; } 412 // Set "full" to the NUL-terminated string in the first unfold column. 413 int32_t length=unfoldStringWidth; 414 while(length>0 && p[length-1]==0) { --length; } 415 full.setTo(FALSE, p, length); 416 // Return the code point. 417 UChar32 c; 418 U16_NEXT_UNSAFE(p, rowCpIndex, c); 419 return c; 420 } 421 422 U_NAMESPACE_END 423 424 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */ 425 U_CAPI int32_t U_EXPORT2 426 ucase_getType(UChar32 c) { 427 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 428 return UCASE_GET_TYPE(props); 429 } 430 431 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */ 432 U_CAPI int32_t U_EXPORT2 433 ucase_getTypeOrIgnorable(UChar32 c) { 434 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 435 return UCASE_GET_TYPE_AND_IGNORABLE(props); 436 } 437 438 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */ 439 static inline int32_t 440 getDotType(UChar32 c) { 441 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 442 if(!PROPS_HAS_EXCEPTION(props)) { 443 return props&UCASE_DOT_MASK; 444 } else { 445 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 446 return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK; 447 } 448 } 449 450 U_CAPI UBool U_EXPORT2 451 ucase_isSoftDotted(UChar32 c) { 452 return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED); 453 } 454 455 U_CAPI UBool U_EXPORT2 456 ucase_isCaseSensitive(UChar32 c) { 457 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 458 return (UBool)((props&UCASE_SENSITIVE)!=0); 459 } 460 461 /* string casing ------------------------------------------------------------ */ 462 463 /* 464 * These internal functions form the core of string case mappings. 
465 * They map single code points to result code points or strings and take 466 * all necessary conditions (context, locale ID, options) into account. 467 * 468 * They do not iterate over the source or write to the destination 469 * so that the same functions are useful for non-standard string storage, 470 * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc. 471 * For the same reason, the "surrounding text" context is passed in as a 472 * UCaseContextIterator which does not make any assumptions about 473 * the underlying storage. 474 * 475 * This section contains helper functions that check for conditions 476 * in the input text surrounding the current code point 477 * according to SpecialCasing.txt. 478 * 479 * Each helper function gets the index 480 * - after the current code point if it looks at following text 481 * - before the current code point if it looks at preceding text 482 * 483 * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows: 484 * 485 * Final_Sigma 486 * C is preceded by a sequence consisting of 487 * a cased letter and a case-ignorable sequence, 488 * and C is not followed by a sequence consisting of 489 * an ignorable sequence and then a cased letter. 490 * 491 * More_Above 492 * C is followed by one or more characters of combining class 230 (ABOVE) 493 * in the combining character sequence. 494 * 495 * After_Soft_Dotted 496 * The last preceding character with combining class of zero before C 497 * was Soft_Dotted, 498 * and there is no intervening combining character class 230 (ABOVE). 499 * 500 * Before_Dot 501 * C is followed by combining dot above (U+0307). 502 * Any sequence of characters with a combining class that is neither 0 nor 230 503 * may intervene between the current character and the combining dot above. 
504 * 505 * The erratum from 2002-10-31 adds the condition 506 * 507 * After_I 508 * The last preceding base character was an uppercase I, and there is no 509 * intervening combining character class 230 (ABOVE). 510 * 511 * (See Jitterbug 2344 and the comments on After_I below.) 512 * 513 * Helper definitions in Unicode 3.2 UAX 21: 514 * 515 * D1. A character C is defined to be cased 516 * if it meets any of the following criteria: 517 * 518 * - The general category of C is Titlecase Letter (Lt) 519 * - In [CoreProps], C has one of the properties Uppercase, or Lowercase 520 * - Given D = NFD(C), then it is not the case that: 521 * D = UCD_lower(D) = UCD_upper(D) = UCD_title(D) 522 * (This third criterium does not add any characters to the list 523 * for Unicode 3.2. Ignored.) 524 * 525 * D2. A character C is defined to be case-ignorable 526 * if it meets either of the following criteria: 527 * 528 * - The general category of C is 529 * Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or 530 * Letter Modifier (Lm), or Symbol Modifier (Sk) 531 * - C is one of the following characters 532 * U+0027 APOSTROPHE 533 * U+00AD SOFT HYPHEN (SHY) 534 * U+2019 RIGHT SINGLE QUOTATION MARK 535 * (the preferred character for apostrophe) 536 * 537 * D3. A case-ignorable sequence is a sequence of 538 * zero or more case-ignorable characters. 539 */ 540 541 #define is_d(c) ((c)=='d' || (c)=='D') 542 #define is_e(c) ((c)=='e' || (c)=='E') 543 #define is_i(c) ((c)=='i' || (c)=='I') 544 #define is_l(c) ((c)=='l' || (c)=='L') 545 #define is_r(c) ((c)=='r' || (c)=='R') 546 #define is_t(c) ((c)=='t' || (c)=='T') 547 #define is_u(c) ((c)=='u' || (c)=='U') 548 #define is_z(c) ((c)=='z' || (c)=='Z') 549 550 /* separator? 
*/ 551 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0) 552 553 /** 554 * Requires non-NULL locale ID but otherwise does the equivalent of 555 * checking for language codes as if uloc_getLanguage() were called: 556 * Accepts both 2- and 3-letter codes and accepts case variants. 557 */ 558 U_CFUNC int32_t 559 ucase_getCaseLocale(const char *locale) { 560 /* 561 * This function used to use uloc_getLanguage(), but the current code 562 * removes the dependency of this low-level code on uloc implementation code 563 * and is faster because not the whole locale ID has to be 564 * examined and copied/transformed. 565 * 566 * Because this code does not want to depend on uloc, the caller must 567 * pass in a non-NULL locale, i.e., may need to call uloc_getDefault(). 568 */ 569 char c=*locale++; 570 // Fastpath for English "en" which is often used for default (=root locale) case mappings, 571 // and for Chinese "zh": Very common but no special case mapping behavior. 572 // Then check lowercase vs. uppercase to reduce the number of comparisons 573 // for other locales without special behavior. 574 if(c=='e') { 575 /* el or ell? */ 576 c=*locale++; 577 if(is_l(c)) { 578 c=*locale++; 579 if(is_l(c)) { 580 c=*locale; 581 } 582 if(is_sep(c)) { 583 return UCASE_LOC_GREEK; 584 } 585 } 586 // en, es, ... -> root 587 } else if(c=='z') { 588 return UCASE_LOC_ROOT; 589 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 590 } else if(c>='a') { // ASCII a-z = 0x61..0x7a, after A-Z 591 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY 592 } else if(c<='z') { // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z 593 #else 594 # error Unknown charset family! 595 #endif 596 // lowercase c 597 if(c=='t') { 598 /* tr or tur? */ 599 c=*locale++; 600 if(is_u(c)) { 601 c=*locale++; 602 } 603 if(is_r(c)) { 604 c=*locale; 605 if(is_sep(c)) { 606 return UCASE_LOC_TURKISH; 607 } 608 } 609 } else if(c=='a') { 610 /* az or aze? 
*/ 611 c=*locale++; 612 if(is_z(c)) { 613 c=*locale++; 614 if(is_e(c)) { 615 c=*locale; 616 } 617 if(is_sep(c)) { 618 return UCASE_LOC_TURKISH; 619 } 620 } 621 } else if(c=='l') { 622 /* lt or lit? */ 623 c=*locale++; 624 if(is_i(c)) { 625 c=*locale++; 626 } 627 if(is_t(c)) { 628 c=*locale; 629 if(is_sep(c)) { 630 return UCASE_LOC_LITHUANIAN; 631 } 632 } 633 } else if(c=='n') { 634 /* nl or nld? */ 635 c=*locale++; 636 if(is_l(c)) { 637 c=*locale++; 638 if(is_d(c)) { 639 c=*locale; 640 } 641 if(is_sep(c)) { 642 return UCASE_LOC_DUTCH; 643 } 644 } 645 } 646 } else { 647 // uppercase c 648 // Same code as for lowercase c but also check for 'E'. 649 if(c=='T') { 650 /* tr or tur? */ 651 c=*locale++; 652 if(is_u(c)) { 653 c=*locale++; 654 } 655 if(is_r(c)) { 656 c=*locale; 657 if(is_sep(c)) { 658 return UCASE_LOC_TURKISH; 659 } 660 } 661 } else if(c=='A') { 662 /* az or aze? */ 663 c=*locale++; 664 if(is_z(c)) { 665 c=*locale++; 666 if(is_e(c)) { 667 c=*locale; 668 } 669 if(is_sep(c)) { 670 return UCASE_LOC_TURKISH; 671 } 672 } 673 } else if(c=='L') { 674 /* lt or lit? */ 675 c=*locale++; 676 if(is_i(c)) { 677 c=*locale++; 678 } 679 if(is_t(c)) { 680 c=*locale; 681 if(is_sep(c)) { 682 return UCASE_LOC_LITHUANIAN; 683 } 684 } 685 } else if(c=='E') { 686 /* el or ell? */ 687 c=*locale++; 688 if(is_l(c)) { 689 c=*locale++; 690 if(is_l(c)) { 691 c=*locale; 692 } 693 if(is_sep(c)) { 694 return UCASE_LOC_GREEK; 695 } 696 } 697 } else if(c=='N') { 698 /* nl or nld? */ 699 c=*locale++; 700 if(is_l(c)) { 701 c=*locale++; 702 if(is_d(c)) { 703 c=*locale; 704 } 705 if(is_sep(c)) { 706 return UCASE_LOC_DUTCH; 707 } 708 } 709 } 710 } 711 return UCASE_LOC_ROOT; 712 } 713 714 /* 715 * Is followed by 716 * {case-ignorable}* cased 717 * ? 718 * (dir determines looking forward/backward) 719 * If a character is case-ignorable, it is skipped regardless of whether 720 * it is also cased or not. 
721 */ 722 static UBool 723 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) { 724 UChar32 c; 725 726 if(iter==NULL) { 727 return FALSE; 728 } 729 730 for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) { 731 int32_t type=ucase_getTypeOrIgnorable(c); 732 if(type&4) { 733 /* case-ignorable, continue with the loop */ 734 } else if(type!=UCASE_NONE) { 735 return TRUE; /* followed by cased letter */ 736 } else { 737 return FALSE; /* uncased and not case-ignorable */ 738 } 739 } 740 741 return FALSE; /* not followed by cased letter */ 742 } 743 744 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */ 745 static UBool 746 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) { 747 UChar32 c; 748 int32_t dotType; 749 int8_t dir; 750 751 if(iter==NULL) { 752 return FALSE; 753 } 754 755 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 756 dotType=getDotType(c); 757 if(dotType==UCASE_SOFT_DOTTED) { 758 return TRUE; /* preceded by TYPE_i */ 759 } else if(dotType!=UCASE_OTHER_ACCENT) { 760 return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */ 761 } 762 } 763 764 return FALSE; /* not preceded by TYPE_i */ 765 } 766 767 /* 768 * See Jitterbug 2344: 769 * The condition After_I for Turkic-lowercasing of U+0307 combining dot above 770 * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because 771 * we made those releases compatible with Unicode 3.2 which had not fixed 772 * a related bug in SpecialCasing.txt. 773 * 774 * From the Jitterbug 2344 text: 775 * ... this bug is listed as a Unicode erratum 776 * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html 777 * <quote> 778 * There are two errors in SpecialCasing.txt. 779 * 1. Missing semicolons on two lines. ... [irrelevant for ICU] 780 * 2. An incorrect context definition. 
Correct as follows: 781 * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE 782 * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE 783 * --- 784 * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 785 * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 786 * where the context After_I is defined as: 787 * The last preceding base character was an uppercase I, and there is no 788 * intervening combining character class 230 (ABOVE). 789 * </quote> 790 * 791 * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as: 792 * 793 * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 794 * # This matches the behavior of the canonically equivalent I-dot_above 795 * 796 * See also the description in this place in older versions of uchar.c (revision 1.100). 797 * 798 * Markus W. Scherer 2003-feb-15 799 */ 800 801 /* Is preceded by base character 'I' with no intervening cc=230 ? */ 802 static UBool 803 isPrecededBy_I(UCaseContextIterator *iter, void *context) { 804 UChar32 c; 805 int32_t dotType; 806 int8_t dir; 807 808 if(iter==NULL) { 809 return FALSE; 810 } 811 812 for(dir=-1; (c=iter(context, dir))>=0; dir=0) { 813 if(c==0x49) { 814 return TRUE; /* preceded by I */ 815 } 816 dotType=getDotType(c); 817 if(dotType!=UCASE_OTHER_ACCENT) { 818 return FALSE; /* preceded by different base character (not I), or intervening cc==230 */ 819 } 820 } 821 822 return FALSE; /* not preceded by I */ 823 } 824 825 /* Is followed by one or more cc==230 ? 
*/ 826 static UBool 827 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) { 828 UChar32 c; 829 int32_t dotType; 830 int8_t dir; 831 832 if(iter==NULL) { 833 return FALSE; 834 } 835 836 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 837 dotType=getDotType(c); 838 if(dotType==UCASE_ABOVE) { 839 return TRUE; /* at least one cc==230 following */ 840 } else if(dotType!=UCASE_OTHER_ACCENT) { 841 return FALSE; /* next base character, no more cc==230 following */ 842 } 843 } 844 845 return FALSE; /* no more cc==230 following */ 846 } 847 848 /* Is followed by a dot above (without cc==230 in between) ? */ 849 static UBool 850 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) { 851 UChar32 c; 852 int32_t dotType; 853 int8_t dir; 854 855 if(iter==NULL) { 856 return FALSE; 857 } 858 859 for(dir=1; (c=iter(context, dir))>=0; dir=0) { 860 if(c==0x307) { 861 return TRUE; 862 } 863 dotType=getDotType(c); 864 if(dotType!=UCASE_OTHER_ACCENT) { 865 return FALSE; /* next base character or cc==230 in between */ 866 } 867 } 868 869 return FALSE; /* no dot above following */ 870 } 871 872 U_CAPI int32_t U_EXPORT2 873 ucase_toFullLower(UChar32 c, 874 UCaseContextIterator *iter, void *context, 875 const UChar **pString, 876 int32_t loc) { 877 // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 
878 U_ASSERT(c >= 0); 879 UChar32 result=c; 880 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 881 if(!PROPS_HAS_EXCEPTION(props)) { 882 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 883 result=c+UCASE_GET_DELTA(props); 884 } 885 } else { 886 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2; 887 uint16_t excWord=*pe++; 888 int32_t full; 889 890 pe2=pe; 891 892 if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) { 893 /* use hardcoded conditions and mappings */ 894 895 /* 896 * Test for conditional mappings first 897 * (otherwise the unconditional default mappings are always taken), 898 * then test for characters that have unconditional mappings in SpecialCasing.txt, 899 * then get the UnicodeData.txt mappings. 900 */ 901 if( loc==UCASE_LOC_LITHUANIAN && 902 /* base characters, find accents above */ 903 (((c==0x49 || c==0x4a || c==0x12e) && 904 isFollowedByMoreAbove(iter, context)) || 905 /* precomposed with accent above, no need to find one */ 906 (c==0xcc || c==0xcd || c==0x128)) 907 ) { 908 /* 909 # Lithuanian 910 911 # Lithuanian retains the dot in a lowercase i when followed by accents. 912 913 # Introduce an explicit dot above when lowercasing capital I's and J's 914 # whenever there are more accents above. 
915 # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) 916 917 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I 918 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J 919 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK 920 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE 921 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE 922 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE 923 */ 924 switch(c) { 925 case 0x49: /* LATIN CAPITAL LETTER I */ 926 *pString=iDot; 927 return 2; 928 case 0x4a: /* LATIN CAPITAL LETTER J */ 929 *pString=jDot; 930 return 2; 931 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */ 932 *pString=iOgonekDot; 933 return 2; 934 case 0xcc: /* LATIN CAPITAL LETTER I WITH GRAVE */ 935 *pString=iDotGrave; 936 return 3; 937 case 0xcd: /* LATIN CAPITAL LETTER I WITH ACUTE */ 938 *pString=iDotAcute; 939 return 3; 940 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */ 941 *pString=iDotTilde; 942 return 3; 943 default: 944 return 0; /* will not occur */ 945 } 946 /* # Turkish and Azeri */ 947 } else if(loc==UCASE_LOC_TURKISH && c==0x130) { 948 /* 949 # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri 950 # The following rules handle those cases. 951 952 0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE 953 0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE 954 */ 955 return 0x69; 956 } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) { 957 /* 958 # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. 
959 # This matches the behavior of the canonically equivalent I-dot_above 960 961 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE 962 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE 963 */ 964 *pString=nullptr; 965 return 0; /* remove the dot (continue without output) */ 966 } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) { 967 /* 968 # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. 969 970 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I 971 0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I 972 */ 973 return 0x131; 974 } else if(c==0x130) { 975 /* 976 # Preserve canonical equivalence for I with dot. Turkic is handled below. 977 978 0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE 979 */ 980 *pString=iDot; 981 return 2; 982 } else if( c==0x3a3 && 983 !isFollowedByCasedLetter(iter, context, 1) && 984 isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */ 985 ) { 986 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */ 987 /* 988 # Special case for final form of sigma 989 990 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA 991 */ 992 return 0x3c2; /* greek small final sigma */ 993 } else { 994 /* no known conditional special case mapping, use a normal mapping */ 995 } 996 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 997 GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 998 full&=UCASE_FULL_LOWER; 999 if(full!=0) { 1000 /* set the output pointer to the lowercase mapping */ 1001 *pString=reinterpret_cast<const UChar *>(pe+1); 1002 1003 /* return the string length */ 1004 return full; 1005 } 1006 } 1007 1008 if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1009 GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result); 1010 } 1011 } 1012 1013 return (result==c) ? 
~result : result;
}

/*
 * internal
 *
 * Shared implementation of full uppercasing and full titlecasing for one code point.
 *
 * c             the code point to map; must be non-negative (asserted below)
 * iter, context only handed on to isPrecededBySoftDotted() for the Lithuanian condition
 * pString       output: set to the mapping string when a full (string) mapping is returned,
 *               set to nullptr when the code point is to be removed;
 *               not written on the other return paths
 * loc           case locale; only UCASE_LOC_TURKISH and UCASE_LOC_LITHUANIAN are tested here
 * upperNotTitle nonzero selects the uppercase mapping, zero selects the titlecase mapping
 *
 * Return value, as established by the return statements in this function:
 *   ~c      (negative) when there is no mapping, i.e., the result would equal c
 *   0       with *pString==nullptr: remove c (continue without output)
 *   1..15   with *pString set: length of the full mapping string (a nonzero 4-bit value)
 *   else    the single code point that c maps to
 */
static int32_t
toUpperOrTitle(UChar32 c,
               UCaseContextIterator *iter, void *context,
               const UChar **pString,
               int32_t loc,
               UBool upperNotTitle) {
    // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    U_ASSERT(c >= 0);
    UChar32 result=c;
    uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    if(!PROPS_HAS_EXCEPTION(props)) {
        /* no exception data: only lowercase code points carry a delta to their upper/title form */
        if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full, idx;

        /*
         * pe2 remembers the position right after excWord so that the simple
         * title/upper slot can still be read below after pe has been moved
         * by the full-mapping walk.
         */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
            /* use hardcoded conditions and mappings */
            if(loc==UCASE_LOC_TURKISH && c==0x69) {
                /*
                # Turkish and Azeri

                # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                # The following rules handle those cases.

                # When uppercasing, i turns into a dotted capital I

                0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
                0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
                */
                return 0x130;
            } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
                /*
                # Lithuanian

                # Lithuanian retains the dot in a lowercase i when followed by accents.

                # Remove DOT ABOVE after "i" with upper or titlecase

                0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
                */
                *pString=nullptr;
                return 0; /* remove the dot (continue without output) */
            } else {
                /* no known conditional special case mapping, use a normal mapping */
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            /*
             * NOTE(review): GET_SLOT_VALUE is a macro defined outside this excerpt;
             * the ++pe below presumably relies on it leaving pe on the slot it read.
             * Confirm against the macro before reordering anything here.
             */
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);

            /* start of full case mapping strings */
            ++pe;

            /*
             * full holds one 4-bit string length per mapping; as consumed here the
             * order from the low bits upward is lowercase, case folding, uppercase, titlecase,
             * and the strings are stored back to back in that same order.
             */

            /* skip the lowercase and case-folding result strings */
            pe+=full&UCASE_FULL_LOWER;
            full>>=4;
            pe+=full&0xf;
            full>>=4;

            if(upperNotTitle) {
                /* the uppercase length is now in the low 4 bits and pe is at the uppercase string */
                full&=0xf;
            } else {
                /* skip the uppercase result string */
                pe+=full&0xf;
                full=(full>>4)&0xf;
            }

            /* a zero length means there is no full mapping of the requested kind; fall back to the simple slots */
            if(full!=0) {
                /* set the output pointer to the result string */
                *pString=reinterpret_cast<const UChar *>(pe);

                /* return the string length */
                return full;
            }
        }

        /* simple (single code point) mapping: prefer the title slot for titlecasing, else the upper slot */
        if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
            idx=UCASE_EXC_TITLE;
        } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
            /* here, titlecase is same as uppercase */
            idx=UCASE_EXC_UPPER;
        } else {
            /* neither slot is present: c has no upper/title mapping */
            return ~c;
        }
        GET_SLOT_VALUE(excWord, idx, pe2, result);
    }

    /* an unchanged code point is reported as its bitwise complement, which is negative for c>=0 */
    return (result==c) ? ~result : result;
}

/*
 * Full uppercase mapping of c: forwards to toUpperOrTitle() with upperNotTitle=TRUE.
 * See toUpperOrTitle() for the meaning of the return value and of *pString.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullUpper(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
}

/*
 * Full titlecase mapping of c: forwards to toUpperOrTitle() with upperNotTitle=FALSE.
 * See toUpperOrTitle() for the meaning of the return value and of *pString.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullTitle(UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  int32_t caseLocale) {
    return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
}

/* case folding ------------------------------------------------------------- */

/*
 * Case folding is similar to lowercasing.
 * The result may be a simple mapping, i.e., a single code point, or
 * a full mapping, i.e., a string.
 * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
 * then only the lowercase mapping is stored.
 *
 * Some special cases are hardcoded because their conditions cannot be
 * parsed and processed from CaseFolding.txt.
 *
 * Unicode 3.2 CaseFolding.txt specifies for its status field:

# C: common case folding, common mappings shared by both simple and full mappings.
# F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
# S: simple case folding, mappings to single characters where different from F.
# T: special case for uppercase I and dotted uppercase I
#    - For non-Turkic languages, this mapping is normally not used.
#    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
#
# Usage:
#  A. To do a simple case folding, use the mappings with status C + S.
#  B. To do a full case folding, use the mappings with status C + F.
#
# The mappings with status T can be used or omitted depending on the desired case-folding
# behavior.
(The default option is to exclude them.) 1154 1155 * Unicode 3.2 has 'T' mappings as follows: 1156 1157 0049; T; 0131; # LATIN CAPITAL LETTER I 1158 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1159 1160 * while the default mappings for these code points are: 1161 1162 0049; C; 0069; # LATIN CAPITAL LETTER I 1163 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE 1164 1165 * U+0130 has no simple case folding (simple-case-folds to itself). 1166 */ 1167 1168 /* return the simple case folding mapping for c */ 1169 U_CAPI UChar32 U_EXPORT2 1170 ucase_fold(UChar32 c, uint32_t options) { 1171 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 1172 if(!PROPS_HAS_EXCEPTION(props)) { 1173 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1174 c+=UCASE_GET_DELTA(props); 1175 } 1176 } else { 1177 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props); 1178 uint16_t excWord=*pe++; 1179 int32_t idx; 1180 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1181 /* special case folding mappings, hardcoded */ 1182 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1183 /* default mappings */ 1184 if(c==0x49) { 1185 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1186 return 0x69; 1187 } else if(c==0x130) { 1188 /* no simple case folding for U+0130 */ 1189 return c; 1190 } 1191 } else { 1192 /* Turkic mappings */ 1193 if(c==0x49) { 1194 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1195 return 0x131; 1196 } else if(c==0x130) { 1197 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1198 return 0x69; 1199 } 1200 } 1201 } 1202 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1203 idx=UCASE_EXC_FOLD; 1204 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1205 idx=UCASE_EXC_LOWER; 1206 } else { 1207 return c; 1208 } 1209 GET_SLOT_VALUE(excWord, idx, pe, c); 1210 } 1211 return c; 1212 } 1213 1214 /* 1215 * Issue for canonical caseless match (UAX #21): 1216 * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve 1217 * canonical 
equivalence, unlike default-option casefolding. 1218 * For example, I-grave and I + grave fold to strings that are not canonically 1219 * equivalent. 1220 * For more details, see the comment in unorm_compare() in unorm.cpp 1221 * and the intermediate prototype changes for Jitterbug 2021. 1222 * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.) 1223 * 1224 * This did not get fixed because it appears that it is not possible to fix 1225 * it for uppercase and lowercase characters (I-grave vs. i-grave) 1226 * together in a way that they still fold to common result strings. 1227 */ 1228 1229 U_CAPI int32_t U_EXPORT2 1230 ucase_toFullFolding(UChar32 c, 1231 const UChar **pString, 1232 uint32_t options) { 1233 // The sign of the result has meaning, input must be non-negative so that it can be returned as is. 1234 U_ASSERT(c >= 0); 1235 UChar32 result=c; 1236 uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c); 1237 if(!PROPS_HAS_EXCEPTION(props)) { 1238 if(UCASE_GET_TYPE(props)>=UCASE_UPPER) { 1239 result=c+UCASE_GET_DELTA(props); 1240 } 1241 } else { 1242 const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2; 1243 uint16_t excWord=*pe++; 1244 int32_t full, idx; 1245 1246 pe2=pe; 1247 1248 if(excWord&UCASE_EXC_CONDITIONAL_FOLD) { 1249 /* use hardcoded conditions and mappings */ 1250 if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) { 1251 /* default mappings */ 1252 if(c==0x49) { 1253 /* 0049; C; 0069; # LATIN CAPITAL LETTER I */ 1254 return 0x69; 1255 } else if(c==0x130) { 1256 /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1257 *pString=iDot; 1258 return 2; 1259 } 1260 } else { 1261 /* Turkic mappings */ 1262 if(c==0x49) { 1263 /* 0049; T; 0131; # LATIN CAPITAL LETTER I */ 1264 return 0x131; 1265 } else if(c==0x130) { 1266 /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */ 1267 return 0x69; 1268 } 1269 } 1270 } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) { 1271 
GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full); 1272 1273 /* start of full case mapping strings */ 1274 ++pe; 1275 1276 /* skip the lowercase result string */ 1277 pe+=full&UCASE_FULL_LOWER; 1278 full=(full>>4)&0xf; 1279 1280 if(full!=0) { 1281 /* set the output pointer to the result string */ 1282 *pString=reinterpret_cast<const UChar *>(pe); 1283 1284 /* return the string length */ 1285 return full; 1286 } 1287 } 1288 1289 if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) { 1290 idx=UCASE_EXC_FOLD; 1291 } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) { 1292 idx=UCASE_EXC_LOWER; 1293 } else { 1294 return ~c; 1295 } 1296 GET_SLOT_VALUE(excWord, idx, pe2, result); 1297 } 1298 1299 return (result==c) ? ~result : result; 1300 } 1301 1302 /* case mapping properties API ---------------------------------------------- */ 1303 1304 /* public API (see uchar.h) */ 1305 1306 U_CAPI UBool U_EXPORT2 1307 u_isULowercase(UChar32 c) { 1308 return (UBool)(UCASE_LOWER==ucase_getType(c)); 1309 } 1310 1311 U_CAPI UBool U_EXPORT2 1312 u_isUUppercase(UChar32 c) { 1313 return (UBool)(UCASE_UPPER==ucase_getType(c)); 1314 } 1315 1316 /* Transforms the Unicode character to its lower case equivalent.*/ 1317 U_CAPI UChar32 U_EXPORT2 1318 u_tolower(UChar32 c) { 1319 return ucase_tolower(c); 1320 } 1321 1322 /* Transforms the Unicode character to its upper case equivalent.*/ 1323 U_CAPI UChar32 U_EXPORT2 1324 u_toupper(UChar32 c) { 1325 return ucase_toupper(c); 1326 } 1327 1328 /* Transforms the Unicode character to its title case equivalent.*/ 1329 U_CAPI UChar32 U_EXPORT2 1330 u_totitle(UChar32 c) { 1331 return ucase_totitle(c); 1332 } 1333 1334 /* return the simple case folding mapping for c */ 1335 U_CAPI UChar32 U_EXPORT2 1336 u_foldCase(UChar32 c, uint32_t options) { 1337 return ucase_fold(c, options); 1338 } 1339 1340 U_CFUNC int32_t U_EXPORT2 1341 ucase_hasBinaryProperty(UChar32 c, UProperty which) { 1342 /* case mapping properties */ 1343 const UChar *resultString; 1344 switch(which) { 
1345 case UCHAR_LOWERCASE: 1346 return (UBool)(UCASE_LOWER==ucase_getType(c)); 1347 case UCHAR_UPPERCASE: 1348 return (UBool)(UCASE_UPPER==ucase_getType(c)); 1349 case UCHAR_SOFT_DOTTED: 1350 return ucase_isSoftDotted(c); 1351 case UCHAR_CASE_SENSITIVE: 1352 return ucase_isCaseSensitive(c); 1353 case UCHAR_CASED: 1354 return (UBool)(UCASE_NONE!=ucase_getType(c)); 1355 case UCHAR_CASE_IGNORABLE: 1356 return (UBool)(ucase_getTypeOrIgnorable(c)>>2); 1357 /* 1358 * Note: The following Changes_When_Xyz are defined as testing whether 1359 * the NFD form of the input changes when Xyz-case-mapped. 1360 * However, this simpler implementation of these properties, 1361 * ignoring NFD, passes the tests. 1362 * The implementation needs to be changed if the tests start failing. 1363 * When that happens, optimizations should be used to work with the 1364 * per-single-code point ucase_toFullXyz() functions unless 1365 * the NFD form has more than one code point, 1366 * and the property starts set needs to be the union of the 1367 * start sets for normalization and case mappings. 1368 */ 1369 case UCHAR_CHANGES_WHEN_LOWERCASED: 1370 return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0); 1371 case UCHAR_CHANGES_WHEN_UPPERCASED: 1372 return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0); 1373 case UCHAR_CHANGES_WHEN_TITLECASED: 1374 return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0); 1375 /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */ 1376 case UCHAR_CHANGES_WHEN_CASEMAPPED: 1377 return (UBool)( 1378 ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 || 1379 ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 || 1380 ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0); 1381 default: 1382 return FALSE; 1383 } 1384 } 1385