Home | History | Annotate | Download | only in common
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 *******************************************************************************
      5 *
      6 *   Copyright (C) 2004-2014, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 *******************************************************************************
     10 *   file name:  ucase.cpp
     11 *   encoding:   UTF-8
     12 *   tab size:   8 (not used)
     13 *   indentation:4
     14 *
     15 *   created on: 2004aug30
     16 *   created by: Markus W. Scherer
     17 *
     18 *   Low-level Unicode character/string case mapping code.
     19 *   Much code moved here (and modified) from uchar.c.
     20 */
     21 
     22 #include "unicode/utypes.h"
     23 #include "unicode/unistr.h"
     24 #include "unicode/uset.h"
     25 #include "unicode/udata.h" /* UDataInfo */
     26 #include "unicode/utf16.h"
     27 #include "ucmndata.h" /* DataHeader */
     28 #include "udatamem.h"
     29 #include "umutex.h"
     30 #include "uassert.h"
     31 #include "cmemory.h"
     32 #include "utrie2.h"
     33 #include "ucase.h"
     34 
     35 struct UCaseProps {
            /* NOTE(review): presumably the loaded data item that backs the pointers below;
             * not referenced in the visible part of this file -- confirm. */
     36     UDataMemory *mem;
            /* not referenced in the visible part of this file */
     37     const int32_t *indexes;
            /* base of the exception words; GET_EXCEPTIONS() offsets into it by props>>UCASE_EXC_SHIFT */
     38     const uint16_t *exceptions;
            /* reverse case folding ("unfold") table: header values followed by rows; may be NULL */
     39     const uint16_t *unfold;
     40 
            /* maps a code point to its 16-bit properties word (read via UTRIE2_GET16) */
     41     UTrie2 trie;
     42     uint8_t formatVersion[4];
     43 };
     44 
     45 /* ucase_props_data.h is machine-generated by gencase --csource */
     46 #define INCLUDED_FROM_UCASE_CPP
     47 #include "ucase_props_data.h"
     48 
     49 /* set of property starts for UnicodeSet ------------------------------------ */
     50 
     51 static UBool U_CALLCONV
     52 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     53     /* add the start code point to the USet */
     54     const USetAdder *sa=(const USetAdder *)context;
     55     sa->add(sa->set, start);
     56     return TRUE;
     57 }
     58 
     59 U_CFUNC void U_EXPORT2
     60 ucase_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode) {
     61     if(U_FAILURE(*pErrorCode)) {
     62         return;
     63     }
     64 
     65     /* add the start code point of each same-value range of the trie */
     66     utrie2_enum(&ucase_props_singleton.trie, NULL, _enumPropertyStartsRange, sa);
     67 
     68     /* add code points with hardcoded properties, plus the ones following them */
     69 
     70     /* (none right now, see comment below) */
     71 
     72     /*
     73      * Omit code points with hardcoded specialcasing properties
     74      * because we do not build property UnicodeSets for them right now.
     75      */
     76 }
     77 
     78 /* data access primitives --------------------------------------------------- */
     79 
        /*
         * Pointer to the first exception word for a properties word:
         * the bits of props above UCASE_EXC_SHIFT are an offset into csp->exceptions.
         * The callers in this file use it only after PROPS_HAS_EXCEPTION(props)
         * tested nonzero.
         */
     80 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
     81 
        /* Nonzero if the UCASE_EXCEPTION bit is set in the properties word. */
     82 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
     83 
     /*
      * flagsOffset[b] is the number of 1 bits in the 8-bit value b.
      *
      * The 256 entries are generated two index bits at a time: the four values
      * 00, 01, 10, 11 of a bit pair add 0, 1, 1, 2 to the count contributed by
      * the remaining bits.
      */
     #define UCASE_BITS2(n) (n), (n)+1, (n)+1, (n)+2
     #define UCASE_BITS4(n) UCASE_BITS2(n), UCASE_BITS2((n)+1), UCASE_BITS2((n)+1), UCASE_BITS2((n)+2)
     #define UCASE_BITS6(n) UCASE_BITS4(n), UCASE_BITS4((n)+1), UCASE_BITS4((n)+1), UCASE_BITS4((n)+2)
     static const uint8_t flagsOffset[256]={
         UCASE_BITS6(0), UCASE_BITS6(1), UCASE_BITS6(1), UCASE_BITS6(2)
     };
     #undef UCASE_BITS6
     #undef UCASE_BITS4
     #undef UCASE_BITS2
    103 
    /* Nonzero if bit idx is set in flags, i.e. the optional slot idx is present. */
    #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
    /* Number of slots present below slot idx: the count of set bits in flags below bit idx. */
    #define SLOT_OFFSET(flags, idx) (flagsOffset[(flags)&((1<<(idx))-1)])

    /*
     * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
     *
     * Without UCASE_EXC_DOUBLE_SLOTS each slot is one uint16_t; with it each slot
     * is two uint16_t, first one in the upper 16 bits of the value.
     *
     * The expansion is a single statement (do { } while(0)), so the macro can be
     * used as the body of an unbraced if/else; write it with a trailing semicolon.
     *
     * @param excWord (in) initial exceptions word
     * @param idx (in) desired slot index
     * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
     *               moved to the last uint16_t of the value, use +1 for beginning of next slot
     * @param value (out) int32_t or uint32_t output; always assigned
     */
    #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
        do { \
            if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
                (pExc16)+=SLOT_OFFSET(excWord, idx); \
                (value)=*(pExc16); \
            } else { \
                (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
                (value)=*(pExc16)++; \
                (value)=((value)<<16)|*(pExc16); \
            } \
        } while(0)
    125 
    126 /* simple case mappings ----------------------------------------------------- */
    127 
    128 U_CAPI UChar32 U_EXPORT2
    129 ucase_tolower(UChar32 c) {
    130     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    131     if(!PROPS_HAS_EXCEPTION(props)) {
    132         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    133             c+=UCASE_GET_DELTA(props);
    134         }
    135     } else {
    136         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    137         uint16_t excWord=*pe++;
    138         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    139             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
    140         }
    141     }
    142     return c;
    143 }
    144 
    145 U_CAPI UChar32 U_EXPORT2
    146 ucase_toupper(UChar32 c) {
    147     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    148     if(!PROPS_HAS_EXCEPTION(props)) {
    149         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    150             c+=UCASE_GET_DELTA(props);
    151         }
    152     } else {
    153         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    154         uint16_t excWord=*pe++;
    155         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    156             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
    157         }
    158     }
    159     return c;
    160 }
    161 
    162 U_CAPI UChar32 U_EXPORT2
    163 ucase_totitle(UChar32 c) {
    164     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    165     if(!PROPS_HAS_EXCEPTION(props)) {
    166         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    167             c+=UCASE_GET_DELTA(props);
    168         }
    169     } else {
    170         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    171         uint16_t excWord=*pe++;
    172         int32_t idx;
    173         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
    174             idx=UCASE_EXC_TITLE;
    175         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    176             idx=UCASE_EXC_UPPER;
    177         } else {
    178             return c;
    179         }
    180         GET_SLOT_VALUE(excWord, idx, pe, c);
    181     }
    182     return c;
    183 }
    184 
    185 static const UChar iDot[2] = { 0x69, 0x307 };
    186 static const UChar jDot[2] = { 0x6a, 0x307 };
    187 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
    188 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
    189 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
    190 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
    191 
    192 
    193 U_CFUNC void U_EXPORT2
    194 ucase_addCaseClosure(UChar32 c, const USetAdder *sa) {
    195     uint16_t props;
    196 
    197     /*
    198      * Hardcode the case closure of i and its relatives and ignore the
    199      * data file data for these characters.
    200      * The Turkic dotless i and dotted I with their case mapping conditions
    201      * and case folding option make the related characters behave specially.
    202      * This code matches their closure behavior to their case folding behavior.
    203      */
    204 
    205     switch(c) {
    206     case 0x49:
    207         /* regular i and I are in one equivalence class */
    208         sa->add(sa->set, 0x69);
    209         return;
    210     case 0x69:
    211         sa->add(sa->set, 0x49);
    212         return;
    213     case 0x130:
    214         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    215         sa->addString(sa->set, iDot, 2);
    216         return;
    217     case 0x131:
    218         /* dotless i is in a class by itself */
    219         return;
    220     default:
    221         /* otherwise use the data file data */
    222         break;
    223     }
    224 
    225     props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    226     if(!PROPS_HAS_EXCEPTION(props)) {
    227         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    228             /* add the one simple case mapping, no matter what type it is */
    229             int32_t delta=UCASE_GET_DELTA(props);
    230             if(delta!=0) {
    231                 sa->add(sa->set, c+delta);
    232             }
    233         }
    234     } else {
    235         /*
    236          * c has exceptions, so there may be multiple simple and/or
    237          * full case mappings. Add them all.
    238          */
    239         const uint16_t *pe0, *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    240         const UChar *closure;
    241         uint16_t excWord=*pe++;
    242         int32_t idx, closureLength, fullLength, length;
    243 
    244         pe0=pe;
    245 
    246         /* add all simple case mappings */
    247         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    248             if(HAS_SLOT(excWord, idx)) {
    249                 pe=pe0;
    250                 GET_SLOT_VALUE(excWord, idx, pe, c);
    251                 sa->add(sa->set, c);
    252             }
    253         }
    254 
    255         /* get the closure string pointer & length */
    256         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    257             pe=pe0;
    258             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    259             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    260             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
    261         } else {
    262             closureLength=0;
    263             closure=NULL;
    264         }
    265 
    266         /* add the full case folding */
    267         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    268             pe=pe0;
    269             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    270 
    271             /* start of full case mapping strings */
    272             ++pe;
    273 
    274             fullLength&=0xffff; /* bits 16 and higher are reserved */
    275 
    276             /* skip the lowercase result string */
    277             pe+=fullLength&UCASE_FULL_LOWER;
    278             fullLength>>=4;
    279 
    280             /* add the full case folding string */
    281             length=fullLength&0xf;
    282             if(length!=0) {
    283                 sa->addString(sa->set, (const UChar *)pe, length);
    284                 pe+=length;
    285             }
    286 
    287             /* skip the uppercase and titlecase strings */
    288             fullLength>>=4;
    289             pe+=fullLength&0xf;
    290             fullLength>>=4;
    291             pe+=fullLength;
    292 
    293             closure=(const UChar *)pe; /* behind full case mappings */
    294         }
    295 
    296         /* add each code point in the closure string */
    297         for(idx=0; idx<closureLength;) {
    298             U16_NEXT_UNSAFE(closure, idx, c);
    299             sa->add(sa->set, c);
    300         }
    301     }
    302 }
    303 
    304 /*
    305  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
    306  * must be length>0 and max>0 and length<=max
    307  */
    308 static inline int32_t
    309 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    310     int32_t c1, c2;
    311 
    312     max-=length; /* we require length<=max, so no need to decrement max in the loop */
    313     do {
    314         c1=*s++;
    315         c2=*t++;
    316         if(c2==0) {
    317             return 1; /* reached the end of t but not of s */
    318         }
    319         c1-=c2;
    320         if(c1!=0) {
    321             return c1; /* return difference result */
    322         }
    323     } while(--length>0);
    324     /* ends with length==0 */
    325 
    326     if(max==0 || *t==0) {
    327         return 0; /* equal to length of both strings */
    328     } else {
    329         return -max; /* return lengh difference */
    330     }
    331 }
    332 
    333 U_CFUNC UBool U_EXPORT2
    334 ucase_addStringCaseClosure(const UChar *s, int32_t length, const USetAdder *sa) {
    335     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    336 
    337     if(ucase_props_singleton.unfold==NULL || s==NULL) {
    338         return FALSE; /* no reverse case folding data, or no string */
    339     }
    340     if(length<=1) {
    341         /* the string is too short to find any match */
    342         /*
    343          * more precise would be:
    344          * if(!u_strHasMoreChar32Than(s, length, 1))
    345          * but this does not make much practical difference because
    346          * a single supplementary code point would just not be found
    347          */
    348         return FALSE;
    349     }
    350 
    351     const uint16_t *unfold=ucase_props_singleton.unfold;
    352     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    353     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    354     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    355     unfold+=unfoldRowWidth;
    356 
    357     if(length>unfoldStringWidth) {
    358         /* the string is too long to find any match */
    359         return FALSE;
    360     }
    361 
    362     /* do a binary search for the string */
    363     start=0;
    364     limit=unfoldRows;
    365     while(start<limit) {
    366         i=(start+limit)/2;
    367         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
    368         result=strcmpMax(s, length, p, unfoldStringWidth);
    369 
    370         if(result==0) {
    371             /* found the string: add each code point, and its case closure */
    372             UChar32 c;
    373 
    374             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
    375                 U16_NEXT_UNSAFE(p, i, c);
    376                 sa->add(sa->set, c);
    377                 ucase_addCaseClosure(c, sa);
    378             }
    379             return TRUE;
    380         } else if(result<0) {
    381             limit=i;
    382         } else /* result>0 */ {
    383             start=i+1;
    384         }
    385     }
    386 
    387     return FALSE; /* string not found */
    388 }
    389 
    390 U_NAMESPACE_BEGIN
    391 
    392 FullCaseFoldingIterator::FullCaseFoldingIterator()
    393         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
    394           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
    395           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
    396           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
    397           currentRow(0),
    398           rowCpIndex(unfoldStringWidth) {
    399     unfold+=unfoldRowWidth;
    400 }
    401 
    402 UChar32
    403 FullCaseFoldingIterator::next(UnicodeString &full) {
    404     // Advance past the last-delivered code point.
    405     const UChar *p=unfold+(currentRow*unfoldRowWidth);
    406     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
    407         ++currentRow;
    408         p+=unfoldRowWidth;
    409         rowCpIndex=unfoldStringWidth;
    410     }
    411     if(currentRow>=unfoldRows) { return U_SENTINEL; }
    412     // Set "full" to the NUL-terminated string in the first unfold column.
    413     int32_t length=unfoldStringWidth;
    414     while(length>0 && p[length-1]==0) { --length; }
    415     full.setTo(FALSE, p, length);
    416     // Return the code point.
    417     UChar32 c;
    418     U16_NEXT_UNSAFE(p, rowCpIndex, c);
    419     return c;
    420 }
    421 
    422 U_NAMESPACE_END
    423 
    424 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
    425 U_CAPI int32_t U_EXPORT2
    426 ucase_getType(UChar32 c) {
    427     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    428     return UCASE_GET_TYPE(props);
    429 }
    430 
    431 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
    432 U_CAPI int32_t U_EXPORT2
    433 ucase_getTypeOrIgnorable(UChar32 c) {
    434     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    435     return UCASE_GET_TYPE_AND_IGNORABLE(props);
    436 }
    437 
    438 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
    439 static inline int32_t
    440 getDotType(UChar32 c) {
    441     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    442     if(!PROPS_HAS_EXCEPTION(props)) {
    443         return props&UCASE_DOT_MASK;
    444     } else {
    445         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
    446         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    447     }
    448 }
    449 
    450 U_CAPI UBool U_EXPORT2
    451 ucase_isSoftDotted(UChar32 c) {
    452     return (UBool)(getDotType(c)==UCASE_SOFT_DOTTED);
    453 }
    454 
    455 U_CAPI UBool U_EXPORT2
    456 ucase_isCaseSensitive(UChar32 c) {
    457     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    458     return (UBool)((props&UCASE_SENSITIVE)!=0);
    459 }
    460 
    461 /* string casing ------------------------------------------------------------ */
    462 
    463 /*
    464  * These internal functions form the core of string case mappings.
    465  * They map single code points to result code points or strings and take
    466  * all necessary conditions (context, locale ID, options) into account.
    467  *
    468  * They do not iterate over the source or write to the destination
    469  * so that the same functions are useful for non-standard string storage,
    470  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    471  * For the same reason, the "surrounding text" context is passed in as a
    472  * UCaseContextIterator which does not make any assumptions about
    473  * the underlying storage.
    474  *
    475  * This section contains helper functions that check for conditions
    476  * in the input text surrounding the current code point
    477  * according to SpecialCasing.txt.
    478  *
    479  * Each helper function gets the index
    480  * - after the current code point if it looks at following text
    481  * - before the current code point if it looks at preceding text
    482  *
    483  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    484  *
    485  * Final_Sigma
    486  *   C is preceded by a sequence consisting of
    487  *     a cased letter and a case-ignorable sequence,
    488  *   and C is not followed by a sequence consisting of
    489  *     an ignorable sequence and then a cased letter.
    490  *
    491  * More_Above
    492  *   C is followed by one or more characters of combining class 230 (ABOVE)
    493  *   in the combining character sequence.
    494  *
    495  * After_Soft_Dotted
    496  *   The last preceding character with combining class of zero before C
    497  *   was Soft_Dotted,
    498  *   and there is no intervening combining character class 230 (ABOVE).
    499  *
    500  * Before_Dot
    501  *   C is followed by combining dot above (U+0307).
    502  *   Any sequence of characters with a combining class that is neither 0 nor 230
    503  *   may intervene between the current character and the combining dot above.
    504  *
    505  * The erratum from 2002-10-31 adds the condition
    506  *
    507  * After_I
    508  *   The last preceding base character was an uppercase I, and there is no
    509  *   intervening combining character class 230 (ABOVE).
    510  *
    511  *   (See Jitterbug 2344 and the comments on After_I below.)
    512  *
    513  * Helper definitions in Unicode 3.2 UAX 21:
    514  *
    515  * D1. A character C is defined to be cased
    516  *     if it meets any of the following criteria:
    517  *
    518  *   - The general category of C is Titlecase Letter (Lt)
    519  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    520  *   - Given D = NFD(C), then it is not the case that:
    521  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
    522  *     (This third criterion does not add any characters to the list
    523  *      for Unicode 3.2. Ignored.)
    524  *
    525  * D2. A character C is defined to be case-ignorable
    526  *     if it meets either of the following criteria:
    527  *
    528  *   - The general category of C is
    529  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    530  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    531  *   - C is one of the following characters
    532  *     U+0027 APOSTROPHE
    533  *     U+00AD SOFT HYPHEN (SHY)
    534  *     U+2019 RIGHT SINGLE QUOTATION MARK
    535  *            (the preferred character for apostrophe)
    536  *
    537  * D3. A case-ignorable sequence is a sequence of
    538  *     zero or more case-ignorable characters.
    539  */
    540 
    541 #define is_d(c) ((c)=='d' || (c)=='D')
    542 #define is_e(c) ((c)=='e' || (c)=='E')
    543 #define is_i(c) ((c)=='i' || (c)=='I')
    544 #define is_l(c) ((c)=='l' || (c)=='L')
    545 #define is_r(c) ((c)=='r' || (c)=='R')
    546 #define is_t(c) ((c)=='t' || (c)=='T')
    547 #define is_u(c) ((c)=='u' || (c)=='U')
    548 #define is_z(c) ((c)=='z' || (c)=='Z')
    549 
    550 /* separator? */
    551 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
    552 
    553 /**
    554  * Requires non-NULL locale ID but otherwise does the equivalent of
    555  * checking for language codes as if uloc_getLanguage() were called:
    556  * Accepts both 2- and 3-letter codes and accepts case variants.
    557  */
    558 U_CFUNC int32_t
    559 ucase_getCaseLocale(const char *locale) {
    560     /*
    561      * This function used to use uloc_getLanguage(), but the current code
    562      * removes the dependency of this low-level code on uloc implementation code
    563      * and is faster because not the whole locale ID has to be
    564      * examined and copied/transformed.
    565      *
    566      * Because this code does not want to depend on uloc, the caller must
    567      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
    568      */
    569     char c=*locale++;
    570     // Fastpath for English "en" which is often used for default (=root locale) case mappings,
    571     // and for Chinese "zh": Very common but no special case mapping behavior.
    572     // Then check lowercase vs. uppercase to reduce the number of comparisons
    573     // for other locales without special behavior.
    574     if(c=='e') {
    575         /* el or ell? */
    576         c=*locale++;
    577         if(is_l(c)) {
    578             c=*locale++;
    579             if(is_l(c)) {
    580                 c=*locale;
    581             }
    582             if(is_sep(c)) {
    583                 return UCASE_LOC_GREEK;
    584             }
    585         }
    586         // en, es, ... -> root
    587     } else if(c=='z') {
    588         return UCASE_LOC_ROOT;
    589 #if U_CHARSET_FAMILY==U_ASCII_FAMILY
    590     } else if(c>='a') {  // ASCII a-z = 0x61..0x7a, after A-Z
    591 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY
    592     } else if(c<='z') {  // EBCDIC a-z = 0x81..0xa9 with two gaps, before A-Z
    593 #else
    594 #   error Unknown charset family!
    595 #endif
    596         // lowercase c
    597         if(c=='t') {
    598             /* tr or tur? */
    599             c=*locale++;
    600             if(is_u(c)) {
    601                 c=*locale++;
    602             }
    603             if(is_r(c)) {
    604                 c=*locale;
    605                 if(is_sep(c)) {
    606                     return UCASE_LOC_TURKISH;
    607                 }
    608             }
    609         } else if(c=='a') {
    610             /* az or aze? */
    611             c=*locale++;
    612             if(is_z(c)) {
    613                 c=*locale++;
    614                 if(is_e(c)) {
    615                     c=*locale;
    616                 }
    617                 if(is_sep(c)) {
    618                     return UCASE_LOC_TURKISH;
    619                 }
    620             }
    621         } else if(c=='l') {
    622             /* lt or lit? */
    623             c=*locale++;
    624             if(is_i(c)) {
    625                 c=*locale++;
    626             }
    627             if(is_t(c)) {
    628                 c=*locale;
    629                 if(is_sep(c)) {
    630                     return UCASE_LOC_LITHUANIAN;
    631                 }
    632             }
    633         } else if(c=='n') {
    634             /* nl or nld? */
    635             c=*locale++;
    636             if(is_l(c)) {
    637                 c=*locale++;
    638                 if(is_d(c)) {
    639                     c=*locale;
    640                 }
    641                 if(is_sep(c)) {
    642                     return UCASE_LOC_DUTCH;
    643                 }
    644             }
    645         }
    646     } else {
    647         // uppercase c
    648         // Same code as for lowercase c but also check for 'E'.
    649         if(c=='T') {
    650             /* tr or tur? */
    651             c=*locale++;
    652             if(is_u(c)) {
    653                 c=*locale++;
    654             }
    655             if(is_r(c)) {
    656                 c=*locale;
    657                 if(is_sep(c)) {
    658                     return UCASE_LOC_TURKISH;
    659                 }
    660             }
    661         } else if(c=='A') {
    662             /* az or aze? */
    663             c=*locale++;
    664             if(is_z(c)) {
    665                 c=*locale++;
    666                 if(is_e(c)) {
    667                     c=*locale;
    668                 }
    669                 if(is_sep(c)) {
    670                     return UCASE_LOC_TURKISH;
    671                 }
    672             }
    673         } else if(c=='L') {
    674             /* lt or lit? */
    675             c=*locale++;
    676             if(is_i(c)) {
    677                 c=*locale++;
    678             }
    679             if(is_t(c)) {
    680                 c=*locale;
    681                 if(is_sep(c)) {
    682                     return UCASE_LOC_LITHUANIAN;
    683                 }
    684             }
    685         } else if(c=='E') {
    686             /* el or ell? */
    687             c=*locale++;
    688             if(is_l(c)) {
    689                 c=*locale++;
    690                 if(is_l(c)) {
    691                     c=*locale;
    692                 }
    693                 if(is_sep(c)) {
    694                     return UCASE_LOC_GREEK;
    695                 }
    696             }
    697         } else if(c=='N') {
    698             /* nl or nld? */
    699             c=*locale++;
    700             if(is_l(c)) {
    701                 c=*locale++;
    702                 if(is_d(c)) {
    703                     c=*locale;
    704                 }
    705                 if(is_sep(c)) {
    706                     return UCASE_LOC_DUTCH;
    707                 }
    708             }
    709         }
    710     }
    711     return UCASE_LOC_ROOT;
    712 }
    713 
    714 /*
    715  * Is followed by
    716  *   {case-ignorable}* cased
    717  * ?
    718  * (dir determines looking forward/backward)
    719  * If a character is case-ignorable, it is skipped regardless of whether
    720  * it is also cased or not.
    721  */
    722 static UBool
    723 isFollowedByCasedLetter(UCaseContextIterator *iter, void *context, int8_t dir) {
    724     UChar32 c;
    725 
    726     if(iter==NULL) {
    727         return FALSE;
    728     }
    729 
    730     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
    731         int32_t type=ucase_getTypeOrIgnorable(c);
    732         if(type&4) {
    733             /* case-ignorable, continue with the loop */
    734         } else if(type!=UCASE_NONE) {
    735             return TRUE; /* followed by cased letter */
    736         } else {
    737             return FALSE; /* uncased and not case-ignorable */
    738         }
    739     }
    740 
    741     return FALSE; /* not followed by cased letter */
    742 }
    743 
    744 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    745 static UBool
    746 isPrecededBySoftDotted(UCaseContextIterator *iter, void *context) {
    747     UChar32 c;
    748     int32_t dotType;
    749     int8_t dir;
    750 
    751     if(iter==NULL) {
    752         return FALSE;
    753     }
    754 
    755     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    756         dotType=getDotType(c);
    757         if(dotType==UCASE_SOFT_DOTTED) {
    758             return TRUE; /* preceded by TYPE_i */
    759         } else if(dotType!=UCASE_OTHER_ACCENT) {
    760             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    761         }
    762     }
    763 
    764     return FALSE; /* not preceded by TYPE_i */
    765 }
    766 
    767 /*
    768  * See Jitterbug 2344:
    769  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    770  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    771  * we made those releases compatible with Unicode 3.2 which had not fixed
    772  * a related bug in SpecialCasing.txt.
    773  *
    774  * From the Jitterbug 2344 text:
    775  * ... this bug is listed as a Unicode erratum
    776  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    777  * <quote>
    778  * There are two errors in SpecialCasing.txt.
    779  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    780  * 2. An incorrect context definition. Correct as follows:
    781  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    782  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    783  * ---
    784  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    785  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    786  * where the context After_I is defined as:
    787  * The last preceding base character was an uppercase I, and there is no
    788  * intervening combining character class 230 (ABOVE).
    789  * </quote>
    790  *
    791  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    792  *
    793  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    794  * # This matches the behavior of the canonically equivalent I-dot_above
    795  *
    796  * See also the description in this place in older versions of uchar.c (revision 1.100).
    797  *
    798  * Markus W. Scherer 2003-feb-15
    799  */
    800 
    801 /* Is preceded by base character 'I' with no intervening cc=230 ? */
    802 static UBool
    803 isPrecededBy_I(UCaseContextIterator *iter, void *context) {
    804     UChar32 c;
    805     int32_t dotType;
    806     int8_t dir;
    807 
    808     if(iter==NULL) {
    809         return FALSE;
    810     }
    811 
    812     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    813         if(c==0x49) {
    814             return TRUE; /* preceded by I */
    815         }
    816         dotType=getDotType(c);
    817         if(dotType!=UCASE_OTHER_ACCENT) {
    818             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
    819         }
    820     }
    821 
    822     return FALSE; /* not preceded by I */
    823 }
    824 
    825 /* Is followed by one or more cc==230 ? */
    826 static UBool
    827 isFollowedByMoreAbove(UCaseContextIterator *iter, void *context) {
    828     UChar32 c;
    829     int32_t dotType;
    830     int8_t dir;
    831 
    832     if(iter==NULL) {
    833         return FALSE;
    834     }
    835 
    836     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    837         dotType=getDotType(c);
    838         if(dotType==UCASE_ABOVE) {
    839             return TRUE; /* at least one cc==230 following */
    840         } else if(dotType!=UCASE_OTHER_ACCENT) {
    841             return FALSE; /* next base character, no more cc==230 following */
    842         }
    843     }
    844 
    845     return FALSE; /* no more cc==230 following */
    846 }
    847 
    848 /* Is followed by a dot above (without cc==230 in between) ? */
    849 static UBool
    850 isFollowedByDotAbove(UCaseContextIterator *iter, void *context) {
    851     UChar32 c;
    852     int32_t dotType;
    853     int8_t dir;
    854 
    855     if(iter==NULL) {
    856         return FALSE;
    857     }
    858 
    859     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    860         if(c==0x307) {
    861             return TRUE;
    862         }
    863         dotType=getDotType(c);
    864         if(dotType!=UCASE_OTHER_ACCENT) {
    865             return FALSE; /* next base character or cc==230 in between */
    866         }
    867     }
    868 
    869     return FALSE; /* no dot above following */
    870 }
    871 
    872 U_CAPI int32_t U_EXPORT2
    873 ucase_toFullLower(UChar32 c,
    874                   UCaseContextIterator *iter, void *context,
    875                   const UChar **pString,
    876                   int32_t loc) {
    877     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
    878     U_ASSERT(c >= 0);
    879     UChar32 result=c;
    880     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
    881     if(!PROPS_HAS_EXCEPTION(props)) {
    882         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    883             result=c+UCASE_GET_DELTA(props);
    884         }
    885     } else {
    886         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
    887         uint16_t excWord=*pe++;
    888         int32_t full;
    889 
    890         pe2=pe;
    891 
    892         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    893             /* use hardcoded conditions and mappings */
    894 
    895             /*
    896              * Test for conditional mappings first
    897              *   (otherwise the unconditional default mappings are always taken),
    898              * then test for characters that have unconditional mappings in SpecialCasing.txt,
    899              * then get the UnicodeData.txt mappings.
    900              */
    901             if( loc==UCASE_LOC_LITHUANIAN &&
    902                     /* base characters, find accents above */
    903                     (((c==0x49 || c==0x4a || c==0x12e) &&
    904                         isFollowedByMoreAbove(iter, context)) ||
    905                     /* precomposed with accent above, no need to find one */
    906                     (c==0xcc || c==0xcd || c==0x128))
    907             ) {
    908                 /*
    909                     # Lithuanian
    910 
    911                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    912 
    913                     # Introduce an explicit dot above when lowercasing capital I's and J's
    914                     # whenever there are more accents above.
    915                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
    916 
    917                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
    918                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
    919                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
    920                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
    921                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
    922                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
    923                  */
    924                 switch(c) {
    925                 case 0x49:  /* LATIN CAPITAL LETTER I */
    926                     *pString=iDot;
    927                     return 2;
    928                 case 0x4a:  /* LATIN CAPITAL LETTER J */
    929                     *pString=jDot;
    930                     return 2;
    931                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
    932                     *pString=iOgonekDot;
    933                     return 2;
    934                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
    935                     *pString=iDotGrave;
    936                     return 3;
    937                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
    938                     *pString=iDotAcute;
    939                     return 3;
    940                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
    941                     *pString=iDotTilde;
    942                     return 3;
    943                 default:
    944                     return 0; /* will not occur */
    945                 }
    946             /* # Turkish and Azeri */
    947             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
    948                 /*
    949                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    950                     # The following rules handle those cases.
    951 
    952                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
    953                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
    954                  */
    955                 return 0x69;
    956             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(iter, context)) {
    957                 /*
    958                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    959                     # This matches the behavior of the canonically equivalent I-dot_above
    960 
    961                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    962                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    963                  */
    964                 *pString=nullptr;
    965                 return 0; /* remove the dot (continue without output) */
    966             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(iter, context)) {
    967                 /*
    968                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
    969 
    970                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
    971                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
    972                  */
    973                 return 0x131;
    974             } else if(c==0x130) {
    975                 /*
    976                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
    977 
    978                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    979                  */
    980                 *pString=iDot;
    981                 return 2;
    982             } else if(  c==0x3a3 &&
    983                         !isFollowedByCasedLetter(iter, context, 1) &&
    984                         isFollowedByCasedLetter(iter, context, -1) /* -1=preceded */
    985             ) {
    986                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
    987                 /*
    988                     # Special case for final form of sigma
    989 
    990                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    991                  */
    992                 return 0x3c2; /* greek small final sigma */
    993             } else {
    994                 /* no known conditional special case mapping, use a normal mapping */
    995             }
    996         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    997             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    998             full&=UCASE_FULL_LOWER;
    999             if(full!=0) {
   1000                 /* set the output pointer to the lowercase mapping */
   1001                 *pString=reinterpret_cast<const UChar *>(pe+1);
   1002 
   1003                 /* return the string length */
   1004                 return full;
   1005             }
   1006         }
   1007 
   1008         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1009             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
   1010         }
   1011     }
   1012 
   1013     return (result==c) ? ~result : result;
   1014 }
   1015 
   1016 /* internal */
   1017 static int32_t
   1018 toUpperOrTitle(UChar32 c,
   1019                UCaseContextIterator *iter, void *context,
   1020                const UChar **pString,
   1021                int32_t loc,
   1022                UBool upperNotTitle) {
   1023     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
   1024     U_ASSERT(c >= 0);
   1025     UChar32 result=c;
   1026     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1027     if(!PROPS_HAS_EXCEPTION(props)) {
   1028         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   1029             result=c+UCASE_GET_DELTA(props);
   1030         }
   1031     } else {
   1032         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
   1033         uint16_t excWord=*pe++;
   1034         int32_t full, idx;
   1035 
   1036         pe2=pe;
   1037 
   1038         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   1039             /* use hardcoded conditions and mappings */
   1040             if(loc==UCASE_LOC_TURKISH && c==0x69) {
   1041                 /*
   1042                     # Turkish and Azeri
   1043 
   1044                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1045                     # The following rules handle those cases.
   1046 
   1047                     # When uppercasing, i turns into a dotted capital I
   1048 
   1049                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   1050                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   1051                 */
   1052                 return 0x130;
   1053             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(iter, context)) {
   1054                 /*
   1055                     # Lithuanian
   1056 
   1057                     # Lithuanian retains the dot in a lowercase i when followed by accents.
   1058 
   1059                     # Remove DOT ABOVE after "i" with upper or titlecase
   1060 
   1061                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   1062                  */
   1063                 *pString=nullptr;
   1064                 return 0; /* remove the dot (continue without output) */
   1065             } else {
   1066                 /* no known conditional special case mapping, use a normal mapping */
   1067             }
   1068         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1069             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1070 
   1071             /* start of full case mapping strings */
   1072             ++pe;
   1073 
   1074             /* skip the lowercase and case-folding result strings */
   1075             pe+=full&UCASE_FULL_LOWER;
   1076             full>>=4;
   1077             pe+=full&0xf;
   1078             full>>=4;
   1079 
   1080             if(upperNotTitle) {
   1081                 full&=0xf;
   1082             } else {
   1083                 /* skip the uppercase result string */
   1084                 pe+=full&0xf;
   1085                 full=(full>>4)&0xf;
   1086             }
   1087 
   1088             if(full!=0) {
   1089                 /* set the output pointer to the result string */
   1090                 *pString=reinterpret_cast<const UChar *>(pe);
   1091 
   1092                 /* return the string length */
   1093                 return full;
   1094             }
   1095         }
   1096 
   1097         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   1098             idx=UCASE_EXC_TITLE;
   1099         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   1100             /* here, titlecase is same as uppercase */
   1101             idx=UCASE_EXC_UPPER;
   1102         } else {
   1103             return ~c;
   1104         }
   1105         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1106     }
   1107 
   1108     return (result==c) ? ~result : result;
   1109 }
   1110 
   1111 U_CAPI int32_t U_EXPORT2
   1112 ucase_toFullUpper(UChar32 c,
   1113                   UCaseContextIterator *iter, void *context,
   1114                   const UChar **pString,
   1115                   int32_t caseLocale) {
   1116     return toUpperOrTitle(c, iter, context, pString, caseLocale, TRUE);
   1117 }
   1118 
   1119 U_CAPI int32_t U_EXPORT2
   1120 ucase_toFullTitle(UChar32 c,
   1121                   UCaseContextIterator *iter, void *context,
   1122                   const UChar **pString,
   1123                   int32_t caseLocale) {
   1124     return toUpperOrTitle(c, iter, context, pString, caseLocale, FALSE);
   1125 }
   1126 
   1127 /* case folding ------------------------------------------------------------- */
   1128 
   1129 /*
   1130  * Case folding is similar to lowercasing.
   1131  * The result may be a simple mapping, i.e., a single code point, or
   1132  * a full mapping, i.e., a string.
   1133  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1134  * then only the lowercase mapping is stored.
   1135  *
   1136  * Some special cases are hardcoded because their conditions cannot be
   1137  * parsed and processed from CaseFolding.txt.
   1138  *
   1139  * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1140 
   1141 # C: common case folding, common mappings shared by both simple and full mappings.
   1142 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1143 # S: simple case folding, mappings to single characters where different from F.
   1144 # T: special case for uppercase I and dotted uppercase I
   1145 #    - For non-Turkic languages, this mapping is normally not used.
   1146 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1147 #
   1148 # Usage:
   1149 #  A. To do a simple case folding, use the mappings with status C + S.
   1150 #  B. To do a full case folding, use the mappings with status C + F.
   1151 #
   1152 #    The mappings with status T can be used or omitted depending on the desired case-folding
   1153 #    behavior. (The default option is to exclude them.)
   1154 
   1155  * Unicode 3.2 has 'T' mappings as follows:
   1156 
   1157 0049; T; 0131; # LATIN CAPITAL LETTER I
   1158 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1159 
   1160  * while the default mappings for these code points are:
   1161 
   1162 0049; C; 0069; # LATIN CAPITAL LETTER I
   1163 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1164 
   1165  * U+0130 has no simple case folding (simple-case-folds to itself).
   1166  */
   1167 
   1168 /* return the simple case folding mapping for c */
   1169 U_CAPI UChar32 U_EXPORT2
   1170 ucase_fold(UChar32 c, uint32_t options) {
   1171     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1172     if(!PROPS_HAS_EXCEPTION(props)) {
   1173         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1174             c+=UCASE_GET_DELTA(props);
   1175         }
   1176     } else {
   1177         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props);
   1178         uint16_t excWord=*pe++;
   1179         int32_t idx;
   1180         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1181             /* special case folding mappings, hardcoded */
   1182             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1183                 /* default mappings */
   1184                 if(c==0x49) {
   1185                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1186                     return 0x69;
   1187                 } else if(c==0x130) {
   1188                     /* no simple case folding for U+0130 */
   1189                     return c;
   1190                 }
   1191             } else {
   1192                 /* Turkic mappings */
   1193                 if(c==0x49) {
   1194                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1195                     return 0x131;
   1196                 } else if(c==0x130) {
   1197                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1198                     return 0x69;
   1199                 }
   1200             }
   1201         }
   1202         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1203             idx=UCASE_EXC_FOLD;
   1204         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1205             idx=UCASE_EXC_LOWER;
   1206         } else {
   1207             return c;
   1208         }
   1209         GET_SLOT_VALUE(excWord, idx, pe, c);
   1210     }
   1211     return c;
   1212 }
   1213 
   1214 /*
   1215  * Issue for canonical caseless match (UAX #21):
   1216  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1217  * canonical equivalence, unlike default-option casefolding.
   1218  * For example, I-grave and I + grave fold to strings that are not canonically
   1219  * equivalent.
   1220  * For more details, see the comment in unorm_compare() in unorm.cpp
   1221  * and the intermediate prototype changes for Jitterbug 2021.
   1222  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1223  *
   1224  * This did not get fixed because it appears that it is not possible to fix
   1225  * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1226  * together in a way that they still fold to common result strings.
   1227  */
   1228 
   1229 U_CAPI int32_t U_EXPORT2
   1230 ucase_toFullFolding(UChar32 c,
   1231                     const UChar **pString,
   1232                     uint32_t options) {
   1233     // The sign of the result has meaning, input must be non-negative so that it can be returned as is.
   1234     U_ASSERT(c >= 0);
   1235     UChar32 result=c;
   1236     uint16_t props=UTRIE2_GET16(&ucase_props_singleton.trie, c);
   1237     if(!PROPS_HAS_EXCEPTION(props)) {
   1238         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1239             result=c+UCASE_GET_DELTA(props);
   1240         }
   1241     } else {
   1242         const uint16_t *pe=GET_EXCEPTIONS(&ucase_props_singleton, props), *pe2;
   1243         uint16_t excWord=*pe++;
   1244         int32_t full, idx;
   1245 
   1246         pe2=pe;
   1247 
   1248         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1249             /* use hardcoded conditions and mappings */
   1250             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1251                 /* default mappings */
   1252                 if(c==0x49) {
   1253                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1254                     return 0x69;
   1255                 } else if(c==0x130) {
   1256                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1257                     *pString=iDot;
   1258                     return 2;
   1259                 }
   1260             } else {
   1261                 /* Turkic mappings */
   1262                 if(c==0x49) {
   1263                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1264                     return 0x131;
   1265                 } else if(c==0x130) {
   1266                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1267                     return 0x69;
   1268                 }
   1269             }
   1270         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1271             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1272 
   1273             /* start of full case mapping strings */
   1274             ++pe;
   1275 
   1276             /* skip the lowercase result string */
   1277             pe+=full&UCASE_FULL_LOWER;
   1278             full=(full>>4)&0xf;
   1279 
   1280             if(full!=0) {
   1281                 /* set the output pointer to the result string */
   1282                 *pString=reinterpret_cast<const UChar *>(pe);
   1283 
   1284                 /* return the string length */
   1285                 return full;
   1286             }
   1287         }
   1288 
   1289         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1290             idx=UCASE_EXC_FOLD;
   1291         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1292             idx=UCASE_EXC_LOWER;
   1293         } else {
   1294             return ~c;
   1295         }
   1296         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1297     }
   1298 
   1299     return (result==c) ? ~result : result;
   1300 }
   1301 
   1302 /* case mapping properties API ---------------------------------------------- */
   1303 
   1304 /* public API (see uchar.h) */
   1305 
   1306 U_CAPI UBool U_EXPORT2
   1307 u_isULowercase(UChar32 c) {
   1308     return (UBool)(UCASE_LOWER==ucase_getType(c));
   1309 }
   1310 
   1311 U_CAPI UBool U_EXPORT2
   1312 u_isUUppercase(UChar32 c) {
   1313     return (UBool)(UCASE_UPPER==ucase_getType(c));
   1314 }
   1315 
   1316 /* Transforms the Unicode character to its lower case equivalent.*/
   1317 U_CAPI UChar32 U_EXPORT2
   1318 u_tolower(UChar32 c) {
   1319     return ucase_tolower(c);
   1320 }
   1321 
   1322 /* Transforms the Unicode character to its upper case equivalent.*/
   1323 U_CAPI UChar32 U_EXPORT2
   1324 u_toupper(UChar32 c) {
   1325     return ucase_toupper(c);
   1326 }
   1327 
   1328 /* Transforms the Unicode character to its title case equivalent.*/
   1329 U_CAPI UChar32 U_EXPORT2
   1330 u_totitle(UChar32 c) {
   1331     return ucase_totitle(c);
   1332 }
   1333 
   1334 /* return the simple case folding mapping for c */
   1335 U_CAPI UChar32 U_EXPORT2
   1336 u_foldCase(UChar32 c, uint32_t options) {
   1337     return ucase_fold(c, options);
   1338 }
   1339 
   1340 U_CFUNC int32_t U_EXPORT2
   1341 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
   1342     /* case mapping properties */
   1343     const UChar *resultString;
   1344     switch(which) {
   1345     case UCHAR_LOWERCASE:
   1346         return (UBool)(UCASE_LOWER==ucase_getType(c));
   1347     case UCHAR_UPPERCASE:
   1348         return (UBool)(UCASE_UPPER==ucase_getType(c));
   1349     case UCHAR_SOFT_DOTTED:
   1350         return ucase_isSoftDotted(c);
   1351     case UCHAR_CASE_SENSITIVE:
   1352         return ucase_isCaseSensitive(c);
   1353     case UCHAR_CASED:
   1354         return (UBool)(UCASE_NONE!=ucase_getType(c));
   1355     case UCHAR_CASE_IGNORABLE:
   1356         return (UBool)(ucase_getTypeOrIgnorable(c)>>2);
   1357     /*
   1358      * Note: The following Changes_When_Xyz are defined as testing whether
   1359      * the NFD form of the input changes when Xyz-case-mapped.
   1360      * However, this simpler implementation of these properties,
   1361      * ignoring NFD, passes the tests.
   1362      * The implementation needs to be changed if the tests start failing.
   1363      * When that happens, optimizations should be used to work with the
   1364      * per-single-code point ucase_toFullXyz() functions unless
   1365      * the NFD form has more than one code point,
   1366      * and the property starts set needs to be the union of the
   1367      * start sets for normalization and case mappings.
   1368      */
   1369     case UCHAR_CHANGES_WHEN_LOWERCASED:
   1370         return (UBool)(ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
   1371     case UCHAR_CHANGES_WHEN_UPPERCASED:
   1372         return (UBool)(ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
   1373     case UCHAR_CHANGES_WHEN_TITLECASED:
   1374         return (UBool)(ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
   1375     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
   1376     case UCHAR_CHANGES_WHEN_CASEMAPPED:
   1377         return (UBool)(
   1378             ucase_toFullLower(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
   1379             ucase_toFullUpper(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0 ||
   1380             ucase_toFullTitle(c, NULL, NULL, &resultString, UCASE_LOC_ROOT)>=0);
   1381     default:
   1382         return FALSE;
   1383     }
   1384 }
   1385