Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2014, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucase.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Low-level Unicode character/string case mapping code.
     17 *   Much code moved here (and modified) from uchar.c.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/unistr.h"
     22 #include "unicode/uset.h"
     23 #include "unicode/udata.h" /* UDataInfo */
     24 #include "unicode/utf16.h"
     25 #include "ucmndata.h" /* DataHeader */
     26 #include "udatamem.h"
     27 #include "umutex.h"
     28 #include "uassert.h"
     29 #include "cmemory.h"
     30 #include "utrie2.h"
     31 #include "ucase.h"
     32 
/*
 * In-memory view of the case properties data.
 * The instance returned by ucase_getSingleton() is ucase_props_singleton,
 * which comes from the machine-generated ucase_props_data.h.
 * Member order matters if that generated header uses an aggregate initializer
 * -- presumably it does; confirm before reordering.
 */
struct UCaseProps {
    UDataMemory *mem;           /* data memory handle; not referenced by the code in this part of the file */
    const int32_t *indexes;     /* presumably the index/header values of the data -- confirm against gencase */
    const uint16_t *exceptions; /* base of the exception words, addressed via GET_EXCEPTIONS() */
    const uint16_t *unfold;     /* reverse case folding table, or NULL (checked in ucase_addStringCaseClosure()) */

    UTrie2 trie;                /* per-code-point 16-bit properties words, read with UTRIE2_GET16() */
    uint8_t formatVersion[4];   /* presumably the data format version -- confirm against gencase */
};
     42 
     43 /* ucase_props_data.h is machine-generated by gencase --csource */
     44 #define INCLUDED_FROM_UCASE_CPP
     45 #include "ucase_props_data.h"
     46 
     47 /* UCaseProps singleton ----------------------------------------------------- */
     48 
     49 U_CAPI const UCaseProps * U_EXPORT2
     50 ucase_getSingleton() {
     51     return &ucase_props_singleton;
     52 }
     53 
     54 /* set of property starts for UnicodeSet ------------------------------------ */
     55 
     56 static UBool U_CALLCONV
     57 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
     58     /* add the start code point to the USet */
     59     const USetAdder *sa=(const USetAdder *)context;
     60     sa->add(sa->set, start);
     61     return TRUE;
     62 }
     63 
     64 U_CFUNC void U_EXPORT2
     65 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
     66     if(U_FAILURE(*pErrorCode)) {
     67         return;
     68     }
     69 
     70     /* add the start code point of each same-value range of the trie */
     71     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
     72 
     73     /* add code points with hardcoded properties, plus the ones following them */
     74 
     75     /* (none right now, see comment below) */
     76 
     77     /*
     78      * Omit code points with hardcoded specialcasing properties
     79      * because we do not build property UnicodeSets for them right now.
     80      */
     81 }
     82 
     83 /* data access primitives --------------------------------------------------- */
     84 
/*
 * Pointer to the first exception word for a properties word:
 * the bits of props above UCASE_EXC_SHIFT are an index into csp->exceptions.
 * Only meaningful when PROPS_HAS_EXCEPTION(props) is nonzero.
 */
#define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))

/* Nonzero if the UCASE_EXCEPTION bit is set in the properties word. */
#define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
     88 
     89 /* number of bits in an 8-bit integer value */
     90 static const uint8_t flagsOffset[256]={
     91     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
     92     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     93     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     94     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     95     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
     96     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     97     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
     98     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
     99     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    100     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    101     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    102     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    103     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    104     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    105     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    106     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    107 };
    108 
    109 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
    110 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
    111 
    112 /*
    113  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
    114  *
    115  * @param excWord (in) initial exceptions word
    116  * @param idx (in) desired slot index
    117  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
    118  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
    119  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
    120  */
    121 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    122     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
    123         (pExc16)+=SLOT_OFFSET(excWord, idx); \
    124         (value)=*pExc16; \
    125     } else { \
    126         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
    127         (value)=*pExc16++; \
    128         (value)=((value)<<16)|*pExc16; \
    129     }
    130 
    131 /* simple case mappings ----------------------------------------------------- */
    132 
    133 U_CAPI UChar32 U_EXPORT2
    134 ucase_tolower(const UCaseProps *csp, UChar32 c) {
    135     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    136     if(!PROPS_HAS_EXCEPTION(props)) {
    137         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    138             c+=UCASE_GET_DELTA(props);
    139         }
    140     } else {
    141         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    142         uint16_t excWord=*pe++;
    143         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    144             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
    145         }
    146     }
    147     return c;
    148 }
    149 
    150 U_CAPI UChar32 U_EXPORT2
    151 ucase_toupper(const UCaseProps *csp, UChar32 c) {
    152     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    153     if(!PROPS_HAS_EXCEPTION(props)) {
    154         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    155             c+=UCASE_GET_DELTA(props);
    156         }
    157     } else {
    158         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    159         uint16_t excWord=*pe++;
    160         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    161             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
    162         }
    163     }
    164     return c;
    165 }
    166 
    167 U_CAPI UChar32 U_EXPORT2
    168 ucase_totitle(const UCaseProps *csp, UChar32 c) {
    169     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    170     if(!PROPS_HAS_EXCEPTION(props)) {
    171         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    172             c+=UCASE_GET_DELTA(props);
    173         }
    174     } else {
    175         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    176         uint16_t excWord=*pe++;
    177         int32_t idx;
    178         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
    179             idx=UCASE_EXC_TITLE;
    180         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    181             idx=UCASE_EXC_UPPER;
    182         } else {
    183             return c;
    184         }
    185         GET_SLOT_VALUE(excWord, idx, pe, c);
    186     }
    187     return c;
    188 }
    189 
    190 static const UChar iDot[2] = { 0x69, 0x307 };
    191 static const UChar jDot[2] = { 0x6a, 0x307 };
    192 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
    193 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
    194 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
    195 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
    196 
    197 
    198 U_CFUNC void U_EXPORT2
    199 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    200     uint16_t props;
    201 
    202     /*
    203      * Hardcode the case closure of i and its relatives and ignore the
    204      * data file data for these characters.
    205      * The Turkic dotless i and dotted I with their case mapping conditions
    206      * and case folding option make the related characters behave specially.
    207      * This code matches their closure behavior to their case folding behavior.
    208      */
    209 
    210     switch(c) {
    211     case 0x49:
    212         /* regular i and I are in one equivalence class */
    213         sa->add(sa->set, 0x69);
    214         return;
    215     case 0x69:
    216         sa->add(sa->set, 0x49);
    217         return;
    218     case 0x130:
    219         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    220         sa->addString(sa->set, iDot, 2);
    221         return;
    222     case 0x131:
    223         /* dotless i is in a class by itself */
    224         return;
    225     default:
    226         /* otherwise use the data file data */
    227         break;
    228     }
    229 
    230     props=UTRIE2_GET16(&csp->trie, c);
    231     if(!PROPS_HAS_EXCEPTION(props)) {
    232         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    233             /* add the one simple case mapping, no matter what type it is */
    234             int32_t delta=UCASE_GET_DELTA(props);
    235             if(delta!=0) {
    236                 sa->add(sa->set, c+delta);
    237             }
    238         }
    239     } else {
    240         /*
    241          * c has exceptions, so there may be multiple simple and/or
    242          * full case mappings. Add them all.
    243          */
    244         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
    245         const UChar *closure;
    246         uint16_t excWord=*pe++;
    247         int32_t idx, closureLength, fullLength, length;
    248 
    249         pe0=pe;
    250 
    251         /* add all simple case mappings */
    252         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    253             if(HAS_SLOT(excWord, idx)) {
    254                 pe=pe0;
    255                 GET_SLOT_VALUE(excWord, idx, pe, c);
    256                 sa->add(sa->set, c);
    257             }
    258         }
    259 
    260         /* get the closure string pointer & length */
    261         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    262             pe=pe0;
    263             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    264             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    265             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
    266         } else {
    267             closureLength=0;
    268             closure=NULL;
    269         }
    270 
    271         /* add the full case folding */
    272         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    273             pe=pe0;
    274             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    275 
    276             /* start of full case mapping strings */
    277             ++pe;
    278 
    279             fullLength&=0xffff; /* bits 16 and higher are reserved */
    280 
    281             /* skip the lowercase result string */
    282             pe+=fullLength&UCASE_FULL_LOWER;
    283             fullLength>>=4;
    284 
    285             /* add the full case folding string */
    286             length=fullLength&0xf;
    287             if(length!=0) {
    288                 sa->addString(sa->set, (const UChar *)pe, length);
    289                 pe+=length;
    290             }
    291 
    292             /* skip the uppercase and titlecase strings */
    293             fullLength>>=4;
    294             pe+=fullLength&0xf;
    295             fullLength>>=4;
    296             pe+=fullLength;
    297 
    298             closure=(const UChar *)pe; /* behind full case mappings */
    299         }
    300 
    301         /* add each code point in the closure string */
    302         for(idx=0; idx<closureLength;) {
    303             U16_NEXT_UNSAFE(closure, idx, c);
    304             sa->add(sa->set, c);
    305         }
    306     }
    307 }
    308 
    309 /*
    310  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
    311  * must be length>0 and max>0 and length<=max
    312  */
    313 static inline int32_t
    314 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    315     int32_t c1, c2;
    316 
    317     max-=length; /* we require length<=max, so no need to decrement max in the loop */
    318     do {
    319         c1=*s++;
    320         c2=*t++;
    321         if(c2==0) {
    322             return 1; /* reached the end of t but not of s */
    323         }
    324         c1-=c2;
    325         if(c1!=0) {
    326             return c1; /* return difference result */
    327         }
    328     } while(--length>0);
    329     /* ends with length==0 */
    330 
    331     if(max==0 || *t==0) {
    332         return 0; /* equal to length of both strings */
    333     } else {
    334         return -max; /* return lengh difference */
    335     }
    336 }
    337 
    338 U_CFUNC UBool U_EXPORT2
    339 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
    340     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    341 
    342     if(csp->unfold==NULL || s==NULL) {
    343         return FALSE; /* no reverse case folding data, or no string */
    344     }
    345     if(length<=1) {
    346         /* the string is too short to find any match */
    347         /*
    348          * more precise would be:
    349          * if(!u_strHasMoreChar32Than(s, length, 1))
    350          * but this does not make much practical difference because
    351          * a single supplementary code point would just not be found
    352          */
    353         return FALSE;
    354     }
    355 
    356     const uint16_t *unfold=csp->unfold;
    357     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    358     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    359     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    360     unfold+=unfoldRowWidth;
    361 
    362     if(length>unfoldStringWidth) {
    363         /* the string is too long to find any match */
    364         return FALSE;
    365     }
    366 
    367     /* do a binary search for the string */
    368     start=0;
    369     limit=unfoldRows;
    370     while(start<limit) {
    371         i=(start+limit)/2;
    372         const UChar *p=reinterpret_cast<const UChar *>(unfold+(i*unfoldRowWidth));
    373         result=strcmpMax(s, length, p, unfoldStringWidth);
    374 
    375         if(result==0) {
    376             /* found the string: add each code point, and its case closure */
    377             UChar32 c;
    378 
    379             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
    380                 U16_NEXT_UNSAFE(p, i, c);
    381                 sa->add(sa->set, c);
    382                 ucase_addCaseClosure(csp, c, sa);
    383             }
    384             return TRUE;
    385         } else if(result<0) {
    386             limit=i;
    387         } else /* result>0 */ {
    388             start=i+1;
    389         }
    390     }
    391 
    392     return FALSE; /* string not found */
    393 }
    394 
    395 U_NAMESPACE_BEGIN
    396 
// Sets up iteration over the reverse case folding ("unfold") table of
// ucase_props_singleton.
// The table dimensions (row count, row width, string width) are read from the
// start of the table; the constructor body then advances unfold by one row
// width so that it points at the first data row.
// NOTE(review): the initializers read unfold[...], so they rely on the unfold
// member being declared (and therefore initialized) before the others --
// confirm against the class declaration.
// NOTE(review): ucase_props_singleton.unfold is dereferenced without a NULL check.
FullCaseFoldingIterator::FullCaseFoldingIterator()
        : unfold(reinterpret_cast<const UChar *>(ucase_props_singleton.unfold)),
          unfoldRows(unfold[UCASE_UNFOLD_ROWS]),
          unfoldRowWidth(unfold[UCASE_UNFOLD_ROW_WIDTH]),
          unfoldStringWidth(unfold[UCASE_UNFOLD_STRING_WIDTH]),
          currentRow(0),
          rowCpIndex(unfoldStringWidth) {
    unfold+=unfoldRowWidth;
}
    406 
    407 UChar32
    408 FullCaseFoldingIterator::next(UnicodeString &full) {
    409     // Advance past the last-delivered code point.
    410     const UChar *p=unfold+(currentRow*unfoldRowWidth);
    411     if(rowCpIndex>=unfoldRowWidth || p[rowCpIndex]==0) {
    412         ++currentRow;
    413         p+=unfoldRowWidth;
    414         rowCpIndex=unfoldStringWidth;
    415     }
    416     if(currentRow>=unfoldRows) { return U_SENTINEL; }
    417     // Set "full" to the NUL-terminated string in the first unfold column.
    418     int32_t length=unfoldStringWidth;
    419     while(length>0 && p[length-1]==0) { --length; }
    420     full.setTo(FALSE, p, length);
    421     // Return the code point.
    422     UChar32 c;
    423     U16_NEXT_UNSAFE(p, rowCpIndex, c);
    424     return c;
    425 }
    426 
    427 U_NAMESPACE_END
    428 
    429 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
    430 U_CAPI int32_t U_EXPORT2
    431 ucase_getType(const UCaseProps *csp, UChar32 c) {
    432     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    433     return UCASE_GET_TYPE(props);
    434 }
    435 
    436 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
    437 U_CAPI int32_t U_EXPORT2
    438 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
    439     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    440     return UCASE_GET_TYPE_AND_IGNORABLE(props);
    441 }
    442 
    443 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
    444 static inline int32_t
    445 getDotType(const UCaseProps *csp, UChar32 c) {
    446     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    447     if(!PROPS_HAS_EXCEPTION(props)) {
    448         return props&UCASE_DOT_MASK;
    449     } else {
    450         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    451         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    452     }
    453 }
    454 
    455 U_CAPI UBool U_EXPORT2
    456 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
    457     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
    458 }
    459 
    460 U_CAPI UBool U_EXPORT2
    461 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
    462     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    463     return (UBool)((props&UCASE_SENSITIVE)!=0);
    464 }
    465 
    466 /* string casing ------------------------------------------------------------ */
    467 
    468 /*
    469  * These internal functions form the core of string case mappings.
    470  * They map single code points to result code points or strings and take
    471  * all necessary conditions (context, locale ID, options) into account.
    472  *
    473  * They do not iterate over the source or write to the destination
    474  * so that the same functions are useful for non-standard string storage,
    475  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    476  * For the same reason, the "surrounding text" context is passed in as a
    477  * UCaseContextIterator which does not make any assumptions about
    478  * the underlying storage.
    479  *
    480  * This section contains helper functions that check for conditions
    481  * in the input text surrounding the current code point
    482  * according to SpecialCasing.txt.
    483  *
    484  * Each helper function gets the index
    485  * - after the current code point if it looks at following text
    486  * - before the current code point if it looks at preceding text
    487  *
    488  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    489  *
    490  * Final_Sigma
    491  *   C is preceded by a sequence consisting of
    492  *     a cased letter and a case-ignorable sequence,
    493  *   and C is not followed by a sequence consisting of
    494  *     an ignorable sequence and then a cased letter.
    495  *
    496  * More_Above
    497  *   C is followed by one or more characters of combining class 230 (ABOVE)
    498  *   in the combining character sequence.
    499  *
    500  * After_Soft_Dotted
    501  *   The last preceding character with combining class of zero before C
    502  *   was Soft_Dotted,
    503  *   and there is no intervening combining character class 230 (ABOVE).
    504  *
    505  * Before_Dot
    506  *   C is followed by combining dot above (U+0307).
    507  *   Any sequence of characters with a combining class that is neither 0 nor 230
    508  *   may intervene between the current character and the combining dot above.
    509  *
    510  * The erratum from 2002-10-31 adds the condition
    511  *
    512  * After_I
    513  *   The last preceding base character was an uppercase I, and there is no
    514  *   intervening combining character class 230 (ABOVE).
    515  *
    516  *   (See Jitterbug 2344 and the comments on After_I below.)
    517  *
    518  * Helper definitions in Unicode 3.2 UAX 21:
    519  *
    520  * D1. A character C is defined to be cased
    521  *     if it meets any of the following criteria:
    522  *
    523  *   - The general category of C is Titlecase Letter (Lt)
    524  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    525  *   - Given D = NFD(C), then it is not the case that:
    526  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
 *     (This third criterion does not add any characters to the list
    528  *      for Unicode 3.2. Ignored.)
    529  *
    530  * D2. A character C is defined to be case-ignorable
    531  *     if it meets either of the following criteria:
    532  *
    533  *   - The general category of C is
    534  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    535  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    536  *   - C is one of the following characters
    537  *     U+0027 APOSTROPHE
    538  *     U+00AD SOFT HYPHEN (SHY)
    539  *     U+2019 RIGHT SINGLE QUOTATION MARK
    540  *            (the preferred character for apostrophe)
    541  *
    542  * D3. A case-ignorable sequence is a sequence of
    543  *     zero or more case-ignorable characters.
    544  */
    545 
/*
 * Case-insensitive single-letter tests used by ucase_getCaseLocale()
 * to recognize language codes. Each macro evaluates its argument twice,
 * so the argument must not have side effects.
 */
#define is_a(c) ((c)=='a' || (c)=='A')
#define is_d(c) ((c)=='d' || (c)=='D')
#define is_e(c) ((c)=='e' || (c)=='E')
#define is_i(c) ((c)=='i' || (c)=='I')
#define is_l(c) ((c)=='l' || (c)=='L')
#define is_n(c) ((c)=='n' || (c)=='N')
#define is_r(c) ((c)=='r' || (c)=='R')
#define is_t(c) ((c)=='t' || (c)=='T')
#define is_u(c) ((c)=='u' || (c)=='U')
#define is_z(c) ((c)=='z' || (c)=='Z')

/* separator? True for '_', '-' and the NUL that terminates the locale ID. */
#define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
    559 
    560 /**
    561  * Requires non-NULL locale ID but otherwise does the equivalent of
    562  * checking for language codes as if uloc_getLanguage() were called:
    563  * Accepts both 2- and 3-letter codes and accepts case variants.
    564  */
    565 U_CFUNC int32_t
    566 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
    567     int32_t result;
    568     char c;
    569 
    570     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
    571         return result;
    572     }
    573 
    574     result=UCASE_LOC_ROOT;
    575 
    576     /*
    577      * This function used to use uloc_getLanguage(), but the current code
    578      * removes the dependency of this low-level code on uloc implementation code
    579      * and is faster because not the whole locale ID has to be
    580      * examined and copied/transformed.
    581      *
    582      * Because this code does not want to depend on uloc, the caller must
    583      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
    584      */
    585     c=*locale++;
    586     if(is_t(c)) {
    587         /* tr or tur? */
    588         c=*locale++;
    589         if(is_u(c)) {
    590             c=*locale++;
    591         }
    592         if(is_r(c)) {
    593             c=*locale;
    594             if(is_sep(c)) {
    595                 result=UCASE_LOC_TURKISH;
    596             }
    597         }
    598     } else if(is_a(c)) {
    599         /* az or aze? */
    600         c=*locale++;
    601         if(is_z(c)) {
    602             c=*locale++;
    603             if(is_e(c)) {
    604                 c=*locale;
    605             }
    606             if(is_sep(c)) {
    607                 result=UCASE_LOC_TURKISH;
    608             }
    609         }
    610     } else if(is_l(c)) {
    611         /* lt or lit? */
    612         c=*locale++;
    613         if(is_i(c)) {
    614             c=*locale++;
    615         }
    616         if(is_t(c)) {
    617             c=*locale;
    618             if(is_sep(c)) {
    619                 result=UCASE_LOC_LITHUANIAN;
    620             }
    621         }
    622     } else if(is_n(c)) {
    623         /* nl or nld? */
    624         c=*locale++;
    625         if(is_l(c)) {
    626             c=*locale++;
    627             if(is_d(c)) {
    628                 c=*locale;
    629             }
    630             if(is_sep(c)) {
    631                 result=UCASE_LOC_DUTCH;
    632             }
    633         }
    634     }
    635 
    636     if(locCache!=NULL) {
    637         *locCache=result;
    638     }
    639     return result;
    640 }
    641 
    642 /*
    643  * Is followed by
    644  *   {case-ignorable}* cased
    645  * ?
    646  * (dir determines looking forward/backward)
    647  * If a character is case-ignorable, it is skipped regardless of whether
    648  * it is also cased or not.
    649  */
    650 static UBool
    651 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
    652     UChar32 c;
    653 
    654     if(iter==NULL) {
    655         return FALSE;
    656     }
    657 
    658     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
    659         int32_t type=ucase_getTypeOrIgnorable(csp, c);
    660         if(type&4) {
    661             /* case-ignorable, continue with the loop */
    662         } else if(type!=UCASE_NONE) {
    663             return TRUE; /* followed by cased letter */
    664         } else {
    665             return FALSE; /* uncased and not case-ignorable */
    666         }
    667     }
    668 
    669     return FALSE; /* not followed by cased letter */
    670 }
    671 
    672 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    673 static UBool
    674 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    675     UChar32 c;
    676     int32_t dotType;
    677     int8_t dir;
    678 
    679     if(iter==NULL) {
    680         return FALSE;
    681     }
    682 
    683     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    684         dotType=getDotType(csp, c);
    685         if(dotType==UCASE_SOFT_DOTTED) {
    686             return TRUE; /* preceded by TYPE_i */
    687         } else if(dotType!=UCASE_OTHER_ACCENT) {
    688             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    689         }
    690     }
    691 
    692     return FALSE; /* not preceded by TYPE_i */
    693 }
    694 
    695 /*
    696  * See Jitterbug 2344:
    697  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    698  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    699  * we made those releases compatible with Unicode 3.2 which had not fixed
    700  * a related bug in SpecialCasing.txt.
    701  *
    702  * From the Jitterbug 2344 text:
    703  * ... this bug is listed as a Unicode erratum
    704  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    705  * <quote>
    706  * There are two errors in SpecialCasing.txt.
    707  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    708  * 2. An incorrect context definition. Correct as follows:
    709  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    710  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    711  * ---
    712  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    713  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    714  * where the context After_I is defined as:
    715  * The last preceding base character was an uppercase I, and there is no
    716  * intervening combining character class 230 (ABOVE).
    717  * </quote>
    718  *
    719  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    720  *
    721  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    722  * # This matches the behavior of the canonically equivalent I-dot_above
    723  *
    724  * See also the description in this place in older versions of uchar.c (revision 1.100).
    725  *
    726  * Markus W. Scherer 2003-feb-15
    727  */
    728 
    729 /* Is preceded by base character 'I' with no intervening cc=230 ? */
    730 static UBool
    731 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    732     UChar32 c;
    733     int32_t dotType;
    734     int8_t dir;
    735 
    736     if(iter==NULL) {
    737         return FALSE;
    738     }
    739 
    740     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    741         if(c==0x49) {
    742             return TRUE; /* preceded by I */
    743         }
    744         dotType=getDotType(csp, c);
    745         if(dotType!=UCASE_OTHER_ACCENT) {
    746             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
    747         }
    748     }
    749 
    750     return FALSE; /* not preceded by I */
    751 }
    752 
    753 /* Is followed by one or more cc==230 ? */
    754 static UBool
    755 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    756     UChar32 c;
    757     int32_t dotType;
    758     int8_t dir;
    759 
    760     if(iter==NULL) {
    761         return FALSE;
    762     }
    763 
    764     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    765         dotType=getDotType(csp, c);
    766         if(dotType==UCASE_ABOVE) {
    767             return TRUE; /* at least one cc==230 following */
    768         } else if(dotType!=UCASE_OTHER_ACCENT) {
    769             return FALSE; /* next base character, no more cc==230 following */
    770         }
    771     }
    772 
    773     return FALSE; /* no more cc==230 following */
    774 }
    775 
    776 /* Is followed by a dot above (without cc==230 in between) ? */
    777 static UBool
    778 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    779     UChar32 c;
    780     int32_t dotType;
    781     int8_t dir;
    782 
    783     if(iter==NULL) {
    784         return FALSE;
    785     }
    786 
    787     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
    788         if(c==0x307) {
    789             return TRUE;
    790         }
    791         dotType=getDotType(csp, c);
    792         if(dotType!=UCASE_OTHER_ACCENT) {
    793             return FALSE; /* next base character or cc==230 in between */
    794         }
    795     }
    796 
    797     return FALSE; /* no dot above following */
    798 }
    799 
    800 U_CAPI int32_t U_EXPORT2
    801 ucase_toFullLower(const UCaseProps *csp, UChar32 c,
    802                   UCaseContextIterator *iter, void *context,
    803                   const UChar **pString,
    804                   const char *locale, int32_t *locCache)
    805 {
    806     UChar32 result=c;
    807     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    808     if(!PROPS_HAS_EXCEPTION(props)) {
    809         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    810             result=c+UCASE_GET_DELTA(props);
    811         }
    812     } else {
    813         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
    814         uint16_t excWord=*pe++;
    815         int32_t full;
    816 
    817         pe2=pe;
    818 
    819         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    820             /* use hardcoded conditions and mappings */
    821             int32_t loc=ucase_getCaseLocale(locale, locCache);
    822 
    823             /*
    824              * Test for conditional mappings first
    825              *   (otherwise the unconditional default mappings are always taken),
    826              * then test for characters that have unconditional mappings in SpecialCasing.txt,
    827              * then get the UnicodeData.txt mappings.
    828              */
    829             if( loc==UCASE_LOC_LITHUANIAN &&
    830                     /* base characters, find accents above */
    831                     (((c==0x49 || c==0x4a || c==0x12e) &&
    832                         isFollowedByMoreAbove(csp, iter, context)) ||
    833                     /* precomposed with accent above, no need to find one */
    834                     (c==0xcc || c==0xcd || c==0x128))
    835             ) {
    836                 /*
    837                     # Lithuanian
    838 
    839                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    840 
    841                     # Introduce an explicit dot above when lowercasing capital I's and J's
    842                     # whenever there are more accents above.
    843                     # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)
    844 
    845                     0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
    846                     004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
    847                     012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
    848                     00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
    849                     00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
    850                     0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
    851                  */
    852                 switch(c) {
    853                 case 0x49:  /* LATIN CAPITAL LETTER I */
    854                     *pString=iDot;
    855                     return 2;
    856                 case 0x4a:  /* LATIN CAPITAL LETTER J */
    857                     *pString=jDot;
    858                     return 2;
    859                 case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
    860                     *pString=iOgonekDot;
    861                     return 2;
    862                 case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
    863                     *pString=iDotGrave;
    864                     return 3;
    865                 case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
    866                     *pString=iDotAcute;
    867                     return 3;
    868                 case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
    869                     *pString=iDotTilde;
    870                     return 3;
    871                 default:
    872                     return 0; /* will not occur */
    873                 }
    874             /* # Turkish and Azeri */
    875             } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
    876                 /*
    877                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    878                     # The following rules handle those cases.
    879 
    880                     0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
    881                     0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
    882                  */
    883                 return 0x69;
    884             } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
    885                 /*
    886                     # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    887                     # This matches the behavior of the canonically equivalent I-dot_above
    888 
    889                     0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    890                     0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    891                  */
    892                 return 0; /* remove the dot (continue without output) */
    893             } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
    894                 /*
    895                     # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.
    896 
    897                     0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
    898                     0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
    899                  */
    900                 return 0x131;
    901             } else if(c==0x130) {
    902                 /*
    903                     # Preserve canonical equivalence for I with dot. Turkic is handled below.
    904 
    905                     0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
    906                  */
    907                 *pString=iDot;
    908                 return 2;
    909             } else if(  c==0x3a3 &&
    910                         !isFollowedByCasedLetter(csp, iter, context, 1) &&
    911                         isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
    912             ) {
    913                 /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
    914                 /*
    915                     # Special case for final form of sigma
    916 
    917                     03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
    918                  */
    919                 return 0x3c2; /* greek small final sigma */
    920             } else {
    921                 /* no known conditional special case mapping, use a normal mapping */
    922             }
    923         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    924             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    925             full&=UCASE_FULL_LOWER;
    926             if(full!=0) {
    927                 /* set the output pointer to the lowercase mapping */
    928                 *pString=reinterpret_cast<const UChar *>(pe+1);
    929 
    930                 /* return the string length */
    931                 return full;
    932             }
    933         }
    934 
    935         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    936             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
    937         }
    938     }
    939 
    940     return (result==c) ? ~result : result;
    941 }
    942 
    943 /* internal */
    944 static int32_t
    945 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
    946                UCaseContextIterator *iter, void *context,
    947                const UChar **pString,
    948                const char *locale, int32_t *locCache,
    949                UBool upperNotTitle) {
    950     UChar32 result=c;
    951     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    952     if(!PROPS_HAS_EXCEPTION(props)) {
    953         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    954             result=c+UCASE_GET_DELTA(props);
    955         }
    956     } else {
    957         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
    958         uint16_t excWord=*pe++;
    959         int32_t full, idx;
    960 
    961         pe2=pe;
    962 
    963         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
    964             /* use hardcoded conditions and mappings */
    965             int32_t loc=ucase_getCaseLocale(locale, locCache);
    966 
    967             if(loc==UCASE_LOC_TURKISH && c==0x69) {
    968                 /*
    969                     # Turkish and Azeri
    970 
    971                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
    972                     # The following rules handle those cases.
    973 
    974                     # When uppercasing, i turns into a dotted capital I
    975 
    976                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
    977                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
    978                 */
    979                 return 0x130;
    980             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
    981                 /*
    982                     # Lithuanian
    983 
    984                     # Lithuanian retains the dot in a lowercase i when followed by accents.
    985 
    986                     # Remove DOT ABOVE after "i" with upper or titlecase
    987 
    988                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
    989                  */
    990                 return 0; /* remove the dot (continue without output) */
    991             } else {
    992                 /* no known conditional special case mapping, use a normal mapping */
    993             }
    994         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    995             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
    996 
    997             /* start of full case mapping strings */
    998             ++pe;
    999 
   1000             /* skip the lowercase and case-folding result strings */
   1001             pe+=full&UCASE_FULL_LOWER;
   1002             full>>=4;
   1003             pe+=full&0xf;
   1004             full>>=4;
   1005 
   1006             if(upperNotTitle) {
   1007                 full&=0xf;
   1008             } else {
   1009                 /* skip the uppercase result string */
   1010                 pe+=full&0xf;
   1011                 full=(full>>4)&0xf;
   1012             }
   1013 
   1014             if(full!=0) {
   1015                 /* set the output pointer to the result string */
   1016                 *pString=reinterpret_cast<const UChar *>(pe);
   1017 
   1018                 /* return the string length */
   1019                 return full;
   1020             }
   1021         }
   1022 
   1023         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   1024             idx=UCASE_EXC_TITLE;
   1025         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   1026             /* here, titlecase is same as uppercase */
   1027             idx=UCASE_EXC_UPPER;
   1028         } else {
   1029             return ~c;
   1030         }
   1031         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1032     }
   1033 
   1034     return (result==c) ? ~result : result;
   1035 }
   1036 
   1037 U_CAPI int32_t U_EXPORT2
   1038 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
   1039                   UCaseContextIterator *iter, void *context,
   1040                   const UChar **pString,
   1041                   const char *locale, int32_t *locCache) {
   1042     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
   1043 }
   1044 
   1045 U_CAPI int32_t U_EXPORT2
   1046 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
   1047                   UCaseContextIterator *iter, void *context,
   1048                   const UChar **pString,
   1049                   const char *locale, int32_t *locCache) {
   1050     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
   1051 }
   1052 
   1053 /* case folding ------------------------------------------------------------- */
   1054 
   1055 /*
   1056  * Case folding is similar to lowercasing.
   1057  * The result may be a simple mapping, i.e., a single code point, or
   1058  * a full mapping, i.e., a string.
   1059  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1060  * then only the lowercase mapping is stored.
   1061  *
   1062  * Some special cases are hardcoded because their conditions cannot be
   1063  * parsed and processed from CaseFolding.txt.
   1064  *
   1065  * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1066 
   1067 # C: common case folding, common mappings shared by both simple and full mappings.
   1068 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1069 # S: simple case folding, mappings to single characters where different from F.
   1070 # T: special case for uppercase I and dotted uppercase I
   1071 #    - For non-Turkic languages, this mapping is normally not used.
   1072 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1073 #
   1074 # Usage:
   1075 #  A. To do a simple case folding, use the mappings with status C + S.
   1076 #  B. To do a full case folding, use the mappings with status C + F.
   1077 #
   1078 #    The mappings with status T can be used or omitted depending on the desired case-folding
   1079 #    behavior. (The default option is to exclude them.)
   1080 
   1081  * Unicode 3.2 has 'T' mappings as follows:
   1082 
   1083 0049; T; 0131; # LATIN CAPITAL LETTER I
   1084 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1085 
   1086  * while the default mappings for these code points are:
   1087 
   1088 0049; C; 0069; # LATIN CAPITAL LETTER I
   1089 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1090 
   1091  * U+0130 has no simple case folding (simple-case-folds to itself).
   1092  */
   1093 
   1094 /* return the simple case folding mapping for c */
   1095 U_CAPI UChar32 U_EXPORT2
   1096 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
   1097     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1098     if(!PROPS_HAS_EXCEPTION(props)) {
   1099         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1100             c+=UCASE_GET_DELTA(props);
   1101         }
   1102     } else {
   1103         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   1104         uint16_t excWord=*pe++;
   1105         int32_t idx;
   1106         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1107             /* special case folding mappings, hardcoded */
   1108             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1109                 /* default mappings */
   1110                 if(c==0x49) {
   1111                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1112                     return 0x69;
   1113                 } else if(c==0x130) {
   1114                     /* no simple case folding for U+0130 */
   1115                     return c;
   1116                 }
   1117             } else {
   1118                 /* Turkic mappings */
   1119                 if(c==0x49) {
   1120                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1121                     return 0x131;
   1122                 } else if(c==0x130) {
   1123                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1124                     return 0x69;
   1125                 }
   1126             }
   1127         }
   1128         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1129             idx=UCASE_EXC_FOLD;
   1130         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1131             idx=UCASE_EXC_LOWER;
   1132         } else {
   1133             return c;
   1134         }
   1135         GET_SLOT_VALUE(excWord, idx, pe, c);
   1136     }
   1137     return c;
   1138 }
   1139 
   1140 /*
   1141  * Issue for canonical caseless match (UAX #21):
   1142  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1143  * canonical equivalence, unlike default-option casefolding.
   1144  * For example, I-grave and I + grave fold to strings that are not canonically
   1145  * equivalent.
   1146  * For more details, see the comment in unorm_compare() in unorm.cpp
   1147  * and the intermediate prototype changes for Jitterbug 2021.
   1148  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1149  *
   1150  * This did not get fixed because it appears that it is not possible to fix
   1151  * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1152  * together in a way that they still fold to common result strings.
   1153  */
   1154 
   1155 U_CAPI int32_t U_EXPORT2
   1156 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
   1157                     const UChar **pString,
   1158                     uint32_t options)
   1159 {
   1160     UChar32 result=c;
   1161     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1162     if(!PROPS_HAS_EXCEPTION(props)) {
   1163         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1164             result=c+UCASE_GET_DELTA(props);
   1165         }
   1166     } else {
   1167         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   1168         uint16_t excWord=*pe++;
   1169         int32_t full, idx;
   1170 
   1171         pe2=pe;
   1172 
   1173         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1174             /* use hardcoded conditions and mappings */
   1175             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1176                 /* default mappings */
   1177                 if(c==0x49) {
   1178                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1179                     return 0x69;
   1180                 } else if(c==0x130) {
   1181                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1182                     *pString=iDot;
   1183                     return 2;
   1184                 }
   1185             } else {
   1186                 /* Turkic mappings */
   1187                 if(c==0x49) {
   1188                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1189                     return 0x131;
   1190                 } else if(c==0x130) {
   1191                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1192                     return 0x69;
   1193                 }
   1194             }
   1195         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1196             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1197 
   1198             /* start of full case mapping strings */
   1199             ++pe;
   1200 
   1201             /* skip the lowercase result string */
   1202             pe+=full&UCASE_FULL_LOWER;
   1203             full=(full>>4)&0xf;
   1204 
   1205             if(full!=0) {
   1206                 /* set the output pointer to the result string */
   1207                 *pString=reinterpret_cast<const UChar *>(pe);
   1208 
   1209                 /* return the string length */
   1210                 return full;
   1211             }
   1212         }
   1213 
   1214         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1215             idx=UCASE_EXC_FOLD;
   1216         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1217             idx=UCASE_EXC_LOWER;
   1218         } else {
   1219             return ~c;
   1220         }
   1221         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1222     }
   1223 
   1224     return (result==c) ? ~result : result;
   1225 }
   1226 
   1227 /* case mapping properties API ---------------------------------------------- */
   1228 
   1229 #define GET_CASE_PROPS() &ucase_props_singleton
   1230 
   1231 /* public API (see uchar.h) */
   1232 
   1233 U_CAPI UBool U_EXPORT2
   1234 u_isULowercase(UChar32 c) {
   1235     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
   1236 }
   1237 
   1238 U_CAPI UBool U_EXPORT2
   1239 u_isUUppercase(UChar32 c) {
   1240     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
   1241 }
   1242 
   1243 /* Transforms the Unicode character to its lower case equivalent.*/
   1244 U_CAPI UChar32 U_EXPORT2
   1245 u_tolower(UChar32 c) {
   1246     return ucase_tolower(GET_CASE_PROPS(), c);
   1247 }
   1248 
   1249 /* Transforms the Unicode character to its upper case equivalent.*/
   1250 U_CAPI UChar32 U_EXPORT2
   1251 u_toupper(UChar32 c) {
   1252     return ucase_toupper(GET_CASE_PROPS(), c);
   1253 }
   1254 
   1255 /* Transforms the Unicode character to its title case equivalent.*/
   1256 U_CAPI UChar32 U_EXPORT2
   1257 u_totitle(UChar32 c) {
   1258     return ucase_totitle(GET_CASE_PROPS(), c);
   1259 }
   1260 
   1261 /* return the simple case folding mapping for c */
   1262 U_CAPI UChar32 U_EXPORT2
   1263 u_foldCase(UChar32 c, uint32_t options) {
   1264     return ucase_fold(GET_CASE_PROPS(), c, options);
   1265 }
   1266 
   1267 U_CFUNC int32_t U_EXPORT2
   1268 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
   1269     /* case mapping properties */
   1270     const UChar *resultString;
   1271     int32_t locCache;
   1272     const UCaseProps *csp=GET_CASE_PROPS();
   1273     if(csp==NULL) {
   1274         return FALSE;
   1275     }
   1276     switch(which) {
   1277     case UCHAR_LOWERCASE:
   1278         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
   1279     case UCHAR_UPPERCASE:
   1280         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
   1281     case UCHAR_SOFT_DOTTED:
   1282         return ucase_isSoftDotted(csp, c);
   1283     case UCHAR_CASE_SENSITIVE:
   1284         return ucase_isCaseSensitive(csp, c);
   1285     case UCHAR_CASED:
   1286         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
   1287     case UCHAR_CASE_IGNORABLE:
   1288         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
   1289     /*
   1290      * Note: The following Changes_When_Xyz are defined as testing whether
   1291      * the NFD form of the input changes when Xyz-case-mapped.
   1292      * However, this simpler implementation of these properties,
   1293      * ignoring NFD, passes the tests.
   1294      * The implementation needs to be changed if the tests start failing.
   1295      * When that happens, optimizations should be used to work with the
   1296      * per-single-code point ucase_toFullXyz() functions unless
   1297      * the NFD form has more than one code point,
   1298      * and the property starts set needs to be the union of the
   1299      * start sets for normalization and case mappings.
   1300      */
   1301     case UCHAR_CHANGES_WHEN_LOWERCASED:
   1302         locCache=UCASE_LOC_ROOT;
   1303         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1304     case UCHAR_CHANGES_WHEN_UPPERCASED:
   1305         locCache=UCASE_LOC_ROOT;
   1306         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1307     case UCHAR_CHANGES_WHEN_TITLECASED:
   1308         locCache=UCASE_LOC_ROOT;
   1309         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1310     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
   1311     case UCHAR_CHANGES_WHEN_CASEMAPPED:
   1312         locCache=UCASE_LOC_ROOT;
   1313         return (UBool)(
   1314             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1315             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1316             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1317     default:
   1318         return FALSE;
   1319     }
   1320 }
   1321