Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucase.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Low-level Unicode character/string case mapping code.
     17 *   Much code moved here (and modified) from uchar.c.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/unistr.h"
     22 #include "unicode/uset.h"
     23 #include "unicode/udata.h" /* UDataInfo */
     24 #include "unicode/utf16.h"
     25 #include "ucmndata.h" /* DataHeader */
     26 #include "udatamem.h"
     27 #include "umutex.h"
     28 #include "uassert.h"
     29 #include "cmemory.h"
     30 #include "utrie2.h"
     31 #include "ucase.h"
     32 #include "ucln_cmn.h"
     33 
/*
 * In-memory view of the case properties data.
 * The instance used in this file (ucase_props_singleton) comes from the
 * machine-generated ucase_props_data.h that is included right after this struct.
 */
struct UCaseProps {
    UDataMemory *mem;            /* NOTE(review): not referenced in the visible code; presumably the backing data memory -- confirm */
    const int32_t *indexes;      /* NOTE(review): not referenced in the visible code -- confirm purpose against gencase */
    const uint16_t *exceptions;  /* exceptions words, addressed via GET_EXCEPTIONS() */
    const uint16_t *unfold;      /* reverse case folding table: one header row, then fixed-width rows; may be NULL */

    UTrie2 trie;                 /* maps a code point to its 16-bit properties word (UTRIE2_GET16) */
    uint8_t formatVersion[4];
};
     43 
     44 /* ucase_props_data.h is machine-generated by gencase --csource */
     45 #define INCLUDED_FROM_UCASE_CPP
     46 #include "ucase_props_data.h"
     47 
     48 /* UCaseProps singleton ----------------------------------------------------- */
     49 
     50 U_CAPI const UCaseProps * U_EXPORT2
     51 ucase_getSingleton() {
     52     return &ucase_props_singleton;
     53 }
     54 
     55 /* set of property starts for UnicodeSet ------------------------------------ */
     56 
     57 static UBool U_CALLCONV
     58 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     59     /* add the start code point to the USet */
     60     const USetAdder *sa=(const USetAdder *)context;
     61     sa->add(sa->set, start);
     62     return TRUE;
     63 }
     64 
     65 U_CFUNC void U_EXPORT2
     66 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
     67     if(U_FAILURE(*pErrorCode)) {
     68         return;
     69     }
     70 
     71     /* add the start code point of each same-value range of the trie */
     72     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
     73 
     74     /* add code points with hardcoded properties, plus the ones following them */
     75 
     76     /* (none right now, see comment below) */
     77 
     78     /*
     79      * Omit code points with hardcoded specialcasing properties
     80      * because we do not build property UnicodeSets for them right now.
     81      */
     82 }
     83 
     84 /* data access primitives --------------------------------------------------- */
     85 
/* Pointer to the first exceptions word for a properties word for which PROPS_HAS_EXCEPTION() is true. */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Nonzero if the properties word has the UCASE_EXCEPTION bit set, i.e., its data is in the exceptions array. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
     89 
/*
 * Number of set bits in an 8-bit integer value (population count).
 * SLOT_OFFSET() uses this table to count the slots that are present
 * before a given slot index.
 */
static const uint8_t flagsOffset[256]={
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
};

/* Nonzero if the bit for slot idx is set in flags (the exceptions word). */
#define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
/* Number of slots present in flags with an index lower than idx. */
#define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
    112 
    113 /*
    114  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
    115  *
    116  * @param excWord (in) initial exceptions word
    117  * @param idx (in) desired slot index
    118  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
    119  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
    120  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
    121  */
    122 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    123     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
    124         (pExc16)+=SLOT_OFFSET(excWord, idx); \
    125         (value)=*pExc16; \
    126     } else { \
    127         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
    128         (value)=*pExc16++; \
    129         (value)=((value)<<16)|*pExc16; \
    130     }
    131 
    132 /* simple case mappings ----------------------------------------------------- */
    133 
    134 U_CAPI UChar32 U_EXPORT2
    135 ucase_tolower(const UCaseProps *csp, UChar32 c) {
    136     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    137     if(!PROPS_HAS_EXCEPTION(props)) {
    138         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    139             c+=UCASE_GET_DELTA(props);
    140         }
    141     } else {
    142         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    143         uint16_t excWord=*pe++;
    144         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    145             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
    146         }
    147     }
    148     return c;
    149 }
    150 
    151 U_CAPI UChar32 U_EXPORT2
    152 ucase_toupper(const UCaseProps *csp, UChar32 c) {
    153     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    154     if(!PROPS_HAS_EXCEPTION(props)) {
    155         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    156             c+=UCASE_GET_DELTA(props);
    157         }
    158     } else {
    159         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    160         uint16_t excWord=*pe++;
    161         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    162             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
    163         }
    164     }
    165     return c;
    166 }
    167 
    168 U_CAPI UChar32 U_EXPORT2
    169 ucase_totitle(const UCaseProps *csp, UChar32 c) {
    170     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    171     if(!PROPS_HAS_EXCEPTION(props)) {
    172         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    173             c+=UCASE_GET_DELTA(props);
    174         }
    175     } else {
    176         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    177         uint16_t excWord=*pe++;
    178         int32_t idx;
    179         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
    180             idx=UCASE_EXC_TITLE;
    181         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    182             idx=UCASE_EXC_UPPER;
    183         } else {
    184             return c;
    185         }
    186         GET_SLOT_VALUE(excWord, idx, pe, c);
    187     }
    188     return c;
    189 }
    190 
    191 static const UChar iDot[2] = { 0x69, 0x307 };
    192 static const UChar jDot[2] = { 0x6a, 0x307 };
    193 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
    194 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
    195 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
    196 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
    197 
    198 
    199 U_CFUNC void U_EXPORT2
    200 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    201     uint16_t props;
    202 
    203     /*
    204      * Hardcode the case closure of i and its relatives and ignore the
    205      * data file data for these characters.
    206      * The Turkic dotless i and dotted I with their case mapping conditions
    207      * and case folding option make the related characters behave specially.
    208      * This code matches their closure behavior to their case folding behavior.
    209      */
    210 
    211     switch(c) {
    212     case 0x49:
    213         /* regular i and I are in one equivalence class */
    214         sa->add(sa->set, 0x69);
    215         return;
    216     case 0x69:
    217         sa->add(sa->set, 0x49);
    218         return;
    219     case 0x130:
    220         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    221         sa->addString(sa->set, iDot, 2);
    222         return;
    223     case 0x131:
    224         /* dotless i is in a class by itself */
    225         return;
    226     default:
    227         /* otherwise use the data file data */
    228         break;
    229     }
    230 
    231     props=UTRIE2_GET16(&csp->trie, c);
    232     if(!PROPS_HAS_EXCEPTION(props)) {
    233         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    234             /* add the one simple case mapping, no matter what type it is */
    235             int32_t delta=UCASE_GET_DELTA(props);
    236             if(delta!=0) {
    237                 sa->add(sa->set, c+delta);
    238             }
    239         }
    240     } else {
    241         /*
    242          * c has exceptions, so there may be multiple simple and/or
    243          * full case mappings. Add them all.
    244          */
    245         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
    246         const UChar *closure;
    247         uint16_t excWord=*pe++;
    248         int32_t idx, closureLength, fullLength, length;
    249 
    250         pe0=pe;
    251 
    252         /* add all simple case mappings */
    253         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    254             if(HAS_SLOT(excWord, idx)) {
    255                 pe=pe0;
    256                 GET_SLOT_VALUE(excWord, idx, pe, c);
    257                 sa->add(sa->set, c);
    258             }
    259         }
    260 
    261         /* get the closure string pointer & length */
    262         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    263             pe=pe0;
    264             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    265             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    266             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
    267         } else {
    268             closureLength=0;
    269             closure=NULL;
    270         }
    271 
    272         /* add the full case folding */
    273         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    274             pe=pe0;
    275             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    276 
    277             /* start of full case mapping strings */
    278             ++pe;
    279 
    280             fullLength&=0xffff; /* bits 16 and higher are reserved */
    281 
    282             /* skip the lowercase result string */
    283             pe+=fullLength&UCASE_FULL_LOWER;
    284             fullLength>>=4;
    285 
    286             /* add the full case folding string */
    287             length=fullLength&0xf;
    288             if(length!=0) {
    289                 sa->addString(sa->set, (const UChar *)pe, length);
    290                 pe+=length;
    291             }
    292 
    293             /* skip the uppercase and titlecase strings */
    294             fullLength>>=4;
    295             pe+=fullLength&0xf;
    296             fullLength>>=4;
    297             pe+=fullLength;
    298 
    299             closure=(const UChar *)pe; /* behind full case mappings */
    300         }
    301 
    302         /* add each code point in the closure string */
    303         for(idx=0; idx<closureLength;) {
    304             U16_NEXT_UNSAFE(closure, idx, c);
    305             sa->add(sa->set, c);
    306         }
    307     }
    308 }
    309 
    310 /*
    311  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
    312  * must be length>0 and max>0 and length<=max
    313  */
    314 static inline int32_t
    315 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    316     int32_t c1, c2;
    317 
    318     max-=length; /* we require length<=max, so no need to decrement max in the loop */
    319     do {
    320         c1=*s++;
    321         c2=*t++;
    322         if(c2==0) {
    323             return 1; /* reached the end of t but not of s */
    324         }
    325         c1-=c2;
    326         if(c1!=0) {
    327             return c1; /* return difference result */
    328         }
    329     } while(--length>0);
    330     /* ends with length==0 */
    331 
    332     if(max==0 || *t==0) {
    333         return 0; /* equal to length of both strings */
    334     } else {
    335         return -max; /* return lengh difference */
    336     }
    337 }
    338 
    339 U_CFUNC UBool U_EXPORT2
    340 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
    341     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    342 
    343     if(csp->unfold==NULL || s==NULL) {
    344         return FALSE; /* no reverse case folding data, or no string */
    345     }
    346     if(length<=1) {
    347         /* the string is too short to find any match */
    348         /*
    349          * more precise would be:
    350          * if(!u_strHasMoreChar32Than(s, length, 1))
    351          * but this does not make much practical difference because
    352          * a single supplementary code point would just not be found
    353          */
    354         return FALSE;
    355     }
    356 
    357     const uint16_t *unfold=csp->unfold;
    358     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    359     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    360     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    361     unfold+=unfoldRowWidth;
    362 
    363     if(length>unfoldStringWidth) {
    364         /* the string is too long to find any match */
    365         return FALSE;
    366     }
    367 
    368     /* do a binary search for the string */
    369     start=0;
    370     limit=unfoldRows;
    371     while(start<limit) {
    372         i=(start+limit)/2;
    373         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
    374         result=strcmpMax(s, length, p, unfoldStringWidth);
    375 
    376         if(result==0) {
    377             /* found the string: add each code point, and its case closure */
    378             UChar32 c;
    379 
    380             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
    381                 U16_NEXT_UNSAFE(p, i, c);
    382                 sa->add(sa->set, c);
    383                 ucase_addCaseClosure(csp, c, sa);
    384             }
    385             return TRUE;
    386         } else if(result<0) {
    387             limit=i;
    388         } else /* result>0 */ {
    389             start=i+1;
    390         }
    391     }
    392 
    393     return FALSE; /* string not found */
    394 }
    395 
    396 U_NAMESPACE_BEGIN
    397 
    398 FullCaseFoldingIterator::FullCaseFoldingIterator()
    399         : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
    400           unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
    401           unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
    402           unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
    403           currentRow(0),
    404           rowCpIndex(unfoldStringWidth) {
    405     unfold+=unfoldRowWidth;
    406 }
    407 
    408 UChar32
    409 FullCaseFoldingIterator::next(UnicodeString &full) {
    410     // Advance past the last-delivered code point.
    411     const UChar *p=unfold+(currentRow*unfoldRowWidth);
    412     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
    413         ++currentRow;
    414         p+=unfoldRowWidth;
    415         rowCpIndex=unfoldStringWidth;
    416     }
    417     if(currentRow>=unfoldRows) { return U_SENTINEL; }
    418     // Set "full" to the NUL-terminated string in the first unfold column.
    419     int32_t length=unfoldStringWidth;
    420     while(length>0 && p[length-1]==0) { --length; }
    421     full.setTo(FALSE, p, length);
    422     // Return the code point.
    423     UChar32 c;
    424     U16_NEXT_UNSAFE(p, rowCpIndex, c);
    425     return c;
    426 }
    427 
    428 U_NAMESPACE_END
    429 
    430 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
    431 U_CAPI int32_t U_EXPORT2
    432 ucase_getType(const UCaseProps *csp, UChar32 c) {
    433     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    434     return UCASE_GET_TYPE(props);
    435 }
    436 
    437 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
    438 U_CAPI int32_t U_EXPORT2
    439 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
    440     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    441     return UCASE_GET_TYPE_AND_IGNORABLE(props);
    442 }
    443 
    444 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
    445 static inline int32_t
    446 getDotType(const UCaseProps *csp, UChar32 c) {
    447     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    448     if(!PROPS_HAS_EXCEPTION(props)) {
    449         return props&UCASE_DOT_MASK;
    450     } else {
    451         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    452         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    453     }
    454 }
    455 
    456 U_CAPI UBool U_EXPORT2
    457 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
    458     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
    459 }
    460 
    461 U_CAPI UBool U_EXPORT2
    462 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
    463     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    464     return (UBool)((props&UCASE_SENSITIVE)!=0);
    465 }
    466 
    467 /* string casing ------------------------------------------------------------ */
    468 
    469 /*
    470  * These internal functions form the core of string case mappings.
    471  * They map single code points to result code points or strings and take
    472  * all necessary conditions (context, locale ID, options) into account.
    473  *
    474  * They do not iterate over the source or write to the destination
    475  * so that the same functions are useful for non-standard string storage,
    476  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    477  * For the same reason, the "surrounding text" context is passed in as a
    478  * UCaseContextIterator which does not make any assumptions about
    479  * the underlying storage.
    480  *
    481  * This section contains helper functions that check for conditions
    482  * in the input text surrounding the current code point
    483  * according to SpecialCasing.txt.
    484  *
    485  * Each helper function gets the index
    486  * - after the current code point if it looks at following text
    487  * - before the current code point if it looks at preceding text
    488  *
    489  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    490  *
    491  * Final_Sigma
    492  *   C is preceded by a sequence consisting of
    493  *     a cased letter and a case-ignorable sequence,
    494  *   and C is not followed by a sequence consisting of
    495  *     an ignorable sequence and then a cased letter.
    496  *
    497  * More_Above
    498  *   C is followed by one or more characters of combining class 230 (ABOVE)
    499  *   in the combining character sequence.
    500  *
    501  * After_Soft_Dotted
    502  *   The last preceding character with combining class of zero before C
    503  *   was Soft_Dotted,
    504  *   and there is no intervening combining character class 230 (ABOVE).
    505  *
    506  * Before_Dot
    507  *   C is followed by combining dot above (U+0307).
    508  *   Any sequence of characters with a combining class that is neither 0 nor 230
    509  *   may intervene between the current character and the combining dot above.
    510  *
    511  * The erratum from 2002-10-31 adds the condition
    512  *
    513  * After_I
    514  *   The last preceding base character was an uppercase I, and there is no
    515  *   intervening combining character class 230 (ABOVE).
    516  *
    517  *   (See Jitterbug 2344 and the comments on After_I below.)
    518  *
    519  * Helper definitions in Unicode 3.2 UAX 21:
    520  *
    521  * D1. A character C is defined to be cased
    522  *     if it meets any of the following criteria:
    523  *
    524  *   - The general category of C is Titlecase Letter (Lt)
    525  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    526  *   - Given D = NFD(C), then it is not the case that:
    527  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
    529  *      for Unicode 3.2. Ignored.)
    530  *
    531  * D2. A character C is defined to be case-ignorable
    532  *     if it meets either of the following criteria:
    533  *
    534  *   - The general category of C is
    535  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    536  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    537  *   - C is one of the following characters
    538  *     U+0027 APOSTROPHE
    539  *     U+00AD SOFT HYPHEN (SHY)
    540  *     U+2019 RIGHT SINGLE QUOTATION MARK
    541  *            (the preferred character for apostrophe)
    542  *
    543  * D3. A case-ignorable sequence is a sequence of
    544  *     zero or more case-ignorable characters.
    545  */
    546 
/*
 * Letter tests used when parsing the language code of a locale ID in
 * ucase_getCaseLocale(); each one accepts the lowercase and the uppercase letter.
 * Note that each macro evaluates its argument more than once.
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the terminating NUL, i.e., where the language code ends. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
    560 
    561 /**
    562  * Requires non-NULL locale ID but otherwise does the equivalent of
    563  * checking for language codes as if uloc_getLanguage() were called:
    564  * Accepts both 2- and 3-letter codes and accepts case variants.
    565  */
    566 U_CFUNC int32_t
    567 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
    568     int32_t result;
    569     char c;
    570 
    571     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
    572         return result;
    573     }
    574 
    575     result=UCASE_LOC_ROOT;
    576 
    577     /*
    578      * This function used to use uloc_getLanguage(), but the current code
    579      * removes the dependency of this low-level code on uloc implementation code
    580      * and is faster because not the whole locale ID has to be
    581      * examined and copied/transformed.
    582      *
    583      * Because this code does not want to depend on uloc, the caller must
    584      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
    585      */
    586     c=*locale++;
    587     if(is_t(c)) {
    588         /* tr or tur? */
    589         c=*locale++;
    590         if(is_u(c)) {
    591             c=*locale++;
    592         }
    593         if(is_r(c)) {
    594             c=*locale;
    595             if(is_sep(c)) {
    596                 result=UCASE_LOC_TURKISH;
    597             }
    598         }
    599     } else if(is_a(c)) {
    600         /* az or aze? */
    601         c=*locale++;
    602         if(is_z(c)) {
    603             c=*locale++;
    604             if(is_e(c)) {
    605                 c=*locale;
    606             }
    607             if(is_sep(c)) {
    608                 result=UCASE_LOC_TURKISH;
    609             }
    610         }
    611     } else if(is_l(c)) {
    612         /* lt or lit? */
    613         c=*locale++;
    614         if(is_i(c)) {
    615             c=*locale++;
    616         }
    617         if(is_t(c)) {
    618             c=*locale;
    619             if(is_sep(c)) {
    620                 result=UCASE_LOC_LITHUANIAN;
    621             }
    622         }
    623     } else if(is_n(c)) {
    624         /* nl or nld? */
    625         c=*locale++;
    626         if(is_l(c)) {
    627             c=*locale++;
    628             if(is_d(c)) {
    629                 c=*locale;
    630             }
    631             if(is_sep(c)) {
    632                 result=UCASE_LOC_DUTCH;
    633             }
    634         }
    635     }
    636 
    637     if(locCache!=NULL) {
    638         *locCache=result;
    639     }
    640     return result;
    641 }
    642 
    643 /*
    644  * Is followed by
    645  *   {case-ignorable}* cased
    646  * ?
    647  * (dir determines looking forward/backward)
    648  * If a character is case-ignorable, it is skipped regardless of whether
    649  * it is also cased or not.
    650  */
    651 static UBool
    652 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
    653     UChar32 c;
    654 
    655     if(iter==NULL) {
    656         return FALSE;
    657     }
    658 
    659     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
    660         int32_t type=ucase_getTypeOrIgnorable(csp, c);
    661         if(type&4) {
    662             /* case-ignorable, continue with the loop */
    663         } else if(type!=UCASE_NONE) {
    664             return TRUE; /* followed by cased letter */
    665         } else {
    666             return FALSE; /* uncased and not case-ignorable */
    667         }
    668     }
    669 
    670     return FALSE; /* not followed by cased letter */
    671 }
    672 
    673 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    674 static UBool
    675 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    676     UChar32 c;
    677     int32_t dotType;
    678     int8_t dir;
    679 
    680     if(iter==NULL) {
    681         return FALSE;
    682     }
    683 
    684     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    685         dotType=getDotType(csp, c);
    686         if(dotType==UCASE_SOFT_DOTTED) {
    687             return TRUE; /* preceded by TYPE_i */
    688         } else if(dotType!=UCASE_OTHER_ACCENT) {
    689             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    690         }
    691     }
    692 
    693     return FALSE; /* not preceded by TYPE_i */
    694 }
    695 
    696 /*
    697  * See Jitterbug 2344:
    698  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    699  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    700  * we made those releases compatible with Unicode 3.2 which had not fixed
    701  * a related bug in SpecialCasing.txt.
    702  *
    703  * From the Jitterbug 2344 text:
    704  * ... this bug is listed as a Unicode erratum
    705  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    706  * <quote>
    707  * There are two errors in SpecialCasing.txt.
    708  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    709  * 2. An incorrect context definition. Correct as follows:
    710  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    711  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    712  * ---
    713  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    714  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    715  * where the context After_I is defined as:
    716  * The last preceding base character was an uppercase I, and there is no
    717  * intervening combining character class 230 (ABOVE).
    718  * </quote>
    719  *
    720  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    721  *
    722  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    723  * # This matches the behavior of the canonically equivalent I-dot_above
    724  *
    725  * See also the description in this place in older versions of uchar.c (revision 1.100).
    726  *
    727  * Markus W. Scherer 2003-feb-15
    728  */
    729 
    730 /* Is preceded by base character 'I' with no intervening cc=230 ? */
    731 static UBool
    732 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    733     UChar32 c;
    734     int32_t dotType;
    735     int8_t dir;
    736 
    737     if(iter==NULL) {
    738         return FALSE;
    739     }
    740 
    741     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    742         if(c==0x49) {
    743             return TRUE; /* preceded by I */
    744         }
    745         dotType=getDotType(csp, c);
    746         if(dotType!=UCASE_OTHER_ACCENT) {
    747             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
    748         }
    749     }
    750 
    751     return FALSE; /* not preceded by I */
    752 }
    753 
    754 /* Is followed by one or more cc==230 ? */
    755 static UBool
    756 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    757     UChar32 c;
    758     int32_t dotType;
    759     int8_t dir;
    760 
    761     if(iter==NULL) {
    762         return FALSE;
    763     }
    764 
    765     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    766         dotType=getDotType(csp, c);
    767         if(dotType==UCASE_ABOVE) {
    768             return TRUE; /* at least one cc==230 following */
    769         } else if(dotType!=UCASE_OTHER_ACCENT) {
    770             return FALSE; /* next base character, no more cc==230 following */
    771         }
    772     }
    773 
    774     return FALSE; /* no more cc==230 following */
    775 }
    776 
    777 /* Is followed by a dot above (without cc==230 in between) ? */
    778 static UBool
    779 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    780     UChar32 c;
    781     int32_t dotType;
    782     int8_t dir;
    783 
    784     if(iter==NULL) {
    785         return FALSE;
    786     }
    787 
    788     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    789         if(c==0x307) {
    790             return TRUE;
    791         }
    792         dotType=getDotType(csp, c);
    793         if(dotType!=UCASE_OTHER_ACCENT) {
    794             return FALSE; /* next base character or cc==230 in between */
    795         }
    796     }
    797 
    798     return FALSE; /* no dot above following */
    799 }
    800 
    801 U_CAPI int32_t U_EXPORT2
    802 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
    803                   UCaseContextIterator *iter, void *context,
    804                   const UChar **pString,
    805                   const char *locale, int32_t *locCache)
    806 {
    807     UChar32 result=c;
    808     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    809     if(!PROPS_HAS_EXCEPTION(props)) {
    810         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    811             result=c+UCASE_GET_DELTA(props);
    812         }
    813     } else {
    814         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
    815         uint16_t excWord=*pe++;
    816         int32_t full;
    817 
    818         pe2=pe;
    819 
    820         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    821             /* use hardcoded conditions and mappings */
    822             int32_t loc=ucase_getCaseLocale(locale, locCache);
    823 
    824             /*
    825              * Test for conditional mappings first
    826              *   (otherwise the unconditional default mappings are always taken),
    827              * then test for characters that have unconditional mappings in SpecialCasing.txt,
    828              * then get the UnicodeData.txt mappings.
    829              */
    830             if( loc==UCASE_LOC_LITHUANIAN &&
    831                     /* base characters, find accents above */
    832                     (((c==0x49 || c==0x4a || c==0x12e) &&
    833                         isFollowedByMoreAbove(csp, iter, context)) ||
    834                     /* precomposed with accent above, no need to find one */
    835                     (c==0xcc || c==0xcd || c==0x128))
    836             ) {
    837                 /*
    838                     # Lithuanian
    839 
    840                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    841 
    842                     # Introduce an explicit dot above when lowercasing capital I's and J's
    843                     # whenever there are more accents above.
    844                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
    845 
    846                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
    847                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
    848                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
    849                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
    850                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
    851                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
    852                  */
    853                 switch(c) {
    854                 case 0x49:  /* LATIN CAPITAL LETTER I */
    855                     *pString=iDot;
    856                     return 2;
    857                 case 0x4a:  /* LATIN CAPITAL LETTER J */
    858                     *pString=jDot;
    859                     return 2;
    860                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
    861                     *pString=iOgonekDot;
    862                     return 2;
    863                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
    864                     *pString=iDotGrave;
    865                     return 3;
    866                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
    867                     *pString=iDotAcute;
    868                     return 3;
    869                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
    870                     *pString=iDotTilde;
    871                     return 3;
    872                 default:
    873                     return 0; /* will not occur */
    874                 }
    875             /* # Turkish and Azeri */
    876             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
    877                 /*
    878                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    879                     # The following rules handle those cases.
    880 
    881                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
    882                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
    883                  */
    884                 return 0x69;
    885             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
    886                 /*
    887                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    888                     # This matches the behavior of the canonically equivalent I-dot_above
    889 
    890                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    891                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    892                  */
    893                 return 0; /* remove the dot (continue without output) */
    894             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
    895                 /*
    896                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
    897 
    898                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
    899                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
    900                  */
    901                 return 0x131;
    902             } else if(c==0x130) {
    903                 /*
    904                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
    905 
    906                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    907                  */
    908                 *pString=iDot;
    909                 return 2;
    910             } else if(  c==0x3a3 &&
    911                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
    912                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
    913             ) {
    914                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
    915                 /*
    916                     # Special case for final form of sigma
    917 
    918                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    919                  */
    920                 return 0x3c2; /* greek small final sigma */
    921             } else {
    922                 /* no known conditional special case mapping, use a normal mapping */
    923             }
    924         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    925             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    926             full&=UCASE_FULL_LOWER;
    927             if(full!=0) {
    928                 /* set the output pointer to the lowercase mapping */
    929                 *pString=reinterpret_cast<const UChar *>(pe+1);
    930 
    931                 /* return the string length */
    932                 return full;
    933             }
    934         }
    935 
    936         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    937             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
    938         }
    939     }
    940 
    941     return (result==c) ? ~result : result;
    942 }
    943 
    944 /* internal */
    945 static int32_t
    946 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
    947                UCaseContextIterator *iter, void *context,
    948                const UChar **pString,
    949                const char *locale, int32_t *locCache,
    950                UBool upperNotTitle) {
    951     UChar32 result=c;
    952     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    953     if(!PROPS_HAS_EXCEPTION(props)) {
    954         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    955             result=c+UCASE_GET_DELTA(props);
    956         }
    957     } else {
    958         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
    959         uint16_t excWord=*pe++;
    960         int32_t full, idx;
    961 
    962         pe2=pe;
    963 
    964         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    965             /* use hardcoded conditions and mappings */
    966             int32_t loc=ucase_getCaseLocale(locale, locCache);
    967 
    968             if(loc==UCASE_LOC_TURKISH && c==0x69) {
    969                 /*
    970                     # Turkish and Azeri
    971 
    972                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    973                     # The following rules handle those cases.
    974 
    975                     # When uppercasing, i turns into a dotted capital I
    976 
    977                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
    978                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
    979                 */
    980                 return 0x130;
    981             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
    982                 /*
    983                     # Lithuanian
    984 
    985                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    986 
    987                     # Remove DOT ABOVE after "i" with upper or titlecase
    988 
    989                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
    990                  */
    991                 return 0; /* remove the dot (continue without output) */
    992             } else {
    993                 /* no known conditional special case mapping, use a normal mapping */
    994             }
    995         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    996             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    997 
    998             /* start of full case mapping strings */
    999             ++pe;
   1000 
   1001             /* skip the lowercase and case-folding result strings */
   1002             pe+=full&UCASE_FULL_LOWER;
   1003             full>>=4;
   1004             pe+=full&0xf;
   1005             full>>=4;
   1006 
   1007             if(upperNotTitle) {
   1008                 full&=0xf;
   1009             } else {
   1010                 /* skip the uppercase result string */
   1011                 pe+=full&0xf;
   1012                 full=(full>>4)&0xf;
   1013             }
   1014 
   1015             if(full!=0) {
   1016                 /* set the output pointer to the result string */
   1017                 *pString=reinterpret_cast<const UChar *>(pe);
   1018 
   1019                 /* return the string length */
   1020                 return full;
   1021             }
   1022         }
   1023 
   1024         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   1025             idx=UCASE_EXC_TITLE;
   1026         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   1027             /* here, titlecase is same as uppercase */
   1028             idx=UCASE_EXC_UPPER;
   1029         } else {
   1030             return ~c;
   1031         }
   1032         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1033     }
   1034 
   1035     return (result==c) ? ~result : result;
   1036 }
   1037 
   1038 U_CAPI int32_t U_EXPORT2
   1039 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
   1040                   UCaseContextIterator *iter, void *context,
   1041                   const UChar **pString,
   1042                   const char *locale, int32_t *locCache) {
   1043     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
   1044 }
   1045 
   1046 U_CAPI int32_t U_EXPORT2
   1047 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
   1048                   UCaseContextIterator *iter, void *context,
   1049                   const UChar **pString,
   1050                   const char *locale, int32_t *locCache) {
   1051     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
   1052 }
   1053 
   1054 /* case folding ------------------------------------------------------------- */
   1055 
   1056 /*
   1057  * Case folding is similar to lowercasing.
   1058  * The result may be a simple mapping, i.e., a single code point, or
   1059  * a full mapping, i.e., a string.
   1060  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1061  * then only the lowercase mapping is stored.
   1062  *
   1063  * Some special cases are hardcoded because their conditions cannot be
   1064  * parsed and processed from CaseFolding.txt.
   1065  *
   1066  * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1067 
   1068 # C: common case folding, common mappings shared by both simple and full mappings.
   1069 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1070 # S: simple case folding, mappings to single characters where different from F.
   1071 # T: special case for uppercase I and dotted uppercase I
   1072 #    - For non-Turkic languages, this mapping is normally not used.
   1073 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1074 #
   1075 # Usage:
   1076 #  A. To do a simple case folding, use the mappings with status C + S.
   1077 #  B. To do a full case folding, use the mappings with status C + F.
   1078 #
   1079 #    The mappings with status T can be used or omitted depending on the desired case-folding
   1080 #    behavior. (The default option is to exclude them.)
   1081 
   1082  * Unicode 3.2 has 'T' mappings as follows:
   1083 
   1084 0049; T; 0131; # LATIN CAPITAL LETTER I
   1085 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1086 
   1087  * while the default mappings for these code points are:
   1088 
   1089 0049; C; 0069; # LATIN CAPITAL LETTER I
   1090 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1091 
   1092  * U+0130 has no simple case folding (simple-case-folds to itself).
   1093  */
   1094 
   1095 /* return the simple case folding mapping for c */
   1096 U_CAPI UChar32 U_EXPORT2
   1097 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
   1098     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1099     if(!PROPS_HAS_EXCEPTION(props)) {
   1100         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1101             c+=UCASE_GET_DELTA(props);
   1102         }
   1103     } else {
   1104         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   1105         uint16_t excWord=*pe++;
   1106         int32_t idx;
   1107         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1108             /* special case folding mappings, hardcoded */
   1109             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1110                 /* default mappings */
   1111                 if(c==0x49) {
   1112                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1113                     return 0x69;
   1114                 } else if(c==0x130) {
   1115                     /* no simple case folding for U+0130 */
   1116                     return c;
   1117                 }
   1118             } else {
   1119                 /* Turkic mappings */
   1120                 if(c==0x49) {
   1121                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1122                     return 0x131;
   1123                 } else if(c==0x130) {
   1124                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1125                     return 0x69;
   1126                 }
   1127             }
   1128         }
   1129         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1130             idx=UCASE_EXC_FOLD;
   1131         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1132             idx=UCASE_EXC_LOWER;
   1133         } else {
   1134             return c;
   1135         }
   1136         GET_SLOT_VALUE(excWord, idx, pe, c);
   1137     }
   1138     return c;
   1139 }
   1140 
   1141 /*
   1142  * Issue for canonical caseless match (UAX #21):
   1143  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1144  * canonical equivalence, unlike default-option casefolding.
   1145  * For example, I-grave and I + grave fold to strings that are not canonically
   1146  * equivalent.
   1147  * For more details, see the comment in unorm_compare() in unorm.cpp
   1148  * and the intermediate prototype changes for Jitterbug 2021.
   1149  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1150  *
   1151  * This did not get fixed because it appears that it is not possible to fix
   1152  * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1153  * together in a way that they still fold to common result strings.
   1154  */
   1155 
   1156 U_CAPI int32_t U_EXPORT2
   1157 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
   1158                     const UChar **pString,
   1159                     uint32_t options)
   1160 {
   1161     UChar32 result=c;
   1162     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1163     if(!PROPS_HAS_EXCEPTION(props)) {
   1164         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1165             result=c+UCASE_GET_DELTA(props);
   1166         }
   1167     } else {
   1168         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   1169         uint16_t excWord=*pe++;
   1170         int32_t full, idx;
   1171 
   1172         pe2=pe;
   1173 
   1174         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1175             /* use hardcoded conditions and mappings */
   1176             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1177                 /* default mappings */
   1178                 if(c==0x49) {
   1179                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1180                     return 0x69;
   1181                 } else if(c==0x130) {
   1182                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1183                     *pString=iDot;
   1184                     return 2;
   1185                 }
   1186             } else {
   1187                 /* Turkic mappings */
   1188                 if(c==0x49) {
   1189                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1190                     return 0x131;
   1191                 } else if(c==0x130) {
   1192                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1193                     return 0x69;
   1194                 }
   1195             }
   1196         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1197             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1198 
   1199             /* start of full case mapping strings */
   1200             ++pe;
   1201 
   1202             /* skip the lowercase result string */
   1203             pe+=full&UCASE_FULL_LOWER;
   1204             full=(full>>4)&0xf;
   1205 
   1206             if(full!=0) {
   1207                 /* set the output pointer to the result string */
   1208                 *pString=reinterpret_cast<const UChar *>(pe);
   1209 
   1210                 /* return the string length */
   1211                 return full;
   1212             }
   1213         }
   1214 
   1215         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1216             idx=UCASE_EXC_FOLD;
   1217         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1218             idx=UCASE_EXC_LOWER;
   1219         } else {
   1220             return ~c;
   1221         }
   1222         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1223     }
   1224 
   1225     return (result==c) ? ~result : result;
   1226 }
   1227 
   1228 /* case mapping properties API ---------------------------------------------- */
   1229 
   1230 #define GET_CASE_PROPS() &ucase_props_singleton
   1231 
   1232 /* public API (see uchar.h) */
   1233 
   1234 U_CAPI UBool U_EXPORT2
   1235 u_isULowercase(UChar32 c) {
   1236     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
   1237 }
   1238 
   1239 U_CAPI UBool U_EXPORT2
   1240 u_isUUppercase(UChar32 c) {
   1241     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
   1242 }
   1243 
   1244 /* Transforms the Unicode character to its lower case equivalent.*/
   1245 U_CAPI UChar32 U_EXPORT2
   1246 u_tolower(UChar32 c) {
   1247     return ucase_tolower(GET_CASE_PROPS(), c);
   1248 }
   1249 
   1250 /* Transforms the Unicode character to its upper case equivalent.*/
   1251 U_CAPI UChar32 U_EXPORT2
   1252 u_toupper(UChar32 c) {
   1253     return ucase_toupper(GET_CASE_PROPS(), c);
   1254 }
   1255 
   1256 /* Transforms the Unicode character to its title case equivalent.*/
   1257 U_CAPI UChar32 U_EXPORT2
   1258 u_totitle(UChar32 c) {
   1259     return ucase_totitle(GET_CASE_PROPS(), c);
   1260 }
   1261 
   1262 /* return the simple case folding mapping for c */
   1263 U_CAPI UChar32 U_EXPORT2
   1264 u_foldCase(UChar32 c, uint32_t options) {
   1265     return ucase_fold(GET_CASE_PROPS(), c, options);
   1266 }
   1267 
   1268 U_CFUNC int32_t U_EXPORT2
   1269 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
   1270     /* case mapping properties */
   1271     const UChar *resultString;
   1272     int32_t locCache;
   1273     const UCaseProps *csp=GET_CASE_PROPS();
   1274     if(csp==NULL) {
   1275         return FALSE;
   1276     }
   1277     switch(which) {
   1278     case UCHAR_LOWERCASE:
   1279         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
   1280     case UCHAR_UPPERCASE:
   1281         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
   1282     case UCHAR_SOFT_DOTTED:
   1283         return ucase_isSoftDotted(csp, c);
   1284     case UCHAR_CASE_SENSITIVE:
   1285         return ucase_isCaseSensitive(csp, c);
   1286     case UCHAR_CASED:
   1287         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
   1288     case UCHAR_CASE_IGNORABLE:
   1289         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
   1290     /*
   1291      * Note: The following Changes_When_Xyz are defined as testing whether
   1292      * the NFD form of the input changes when Xyz-case-mapped.
   1293      * However, this simpler implementation of these properties,
   1294      * ignoring NFD, passes the tests.
   1295      * The implementation needs to be changed if the tests start failing.
   1296      * When that happens, optimizations should be used to work with the
   1297      * per-single-code point ucase_toFullXyz() functions unless
   1298      * the NFD form has more than one code point,
   1299      * and the property starts set needs to be the union of the
   1300      * start sets for normalization and case mappings.
   1301      */
   1302     case UCHAR_CHANGES_WHEN_LOWERCASED:
   1303         locCache=UCASE_LOC_ROOT;
   1304         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1305     case UCHAR_CHANGES_WHEN_UPPERCASED:
   1306         locCache=UCASE_LOC_ROOT;
   1307         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1308     case UCHAR_CHANGES_WHEN_TITLECASED:
   1309         locCache=UCASE_LOC_ROOT;
   1310         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1311     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
   1312     case UCHAR_CHANGES_WHEN_CASEMAPPED:
   1313         locCache=UCASE_LOC_ROOT;
   1314         return (UBool)(
   1315             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1316             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1317             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1318     default:
   1319         return FALSE;
   1320     }
   1321 }
   1322