Home | History | Annotate | Download | only in common
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2004-2009, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucase.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2004aug30
     14 *   created by: Markus W. Scherer
     15 *
     16 *   Low-level Unicode character/string case mapping code.
     17 *   Much code moved here (and modified) from uchar.c.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/uset.h"
     22 #include "unicode/udata.h" /* UDataInfo */
     23 #include "ucmndata.h" /* DataHeader */
     24 #include "udatamem.h"
     25 #include "umutex.h"
     26 #include "uassert.h"
     27 #include "cmemory.h"
     28 #include "utrie2.h"
     29 #include "ucase.h"
     30 #include "ucln_cmn.h"
     31 
/*
 * In-memory view of the case-mapping properties data.
 * The pointer members point into the loaded binary data; they are set up
 * by ucase_openData() (or ucase_getDummy()) in this file.
 */
struct UCaseProps {
    UDataMemory *mem;           /* handle from udata_openChoice(); released via udata_close() in ucase_close() */
    const int32_t *indexes;     /* start of the data: array of int32_t indexes (UCASE_IX_...) */
    const uint16_t *exceptions; /* exceptions words, located directly after the serialized trie */
    const UChar *unfold;        /* reverse case folding table after exceptions[], or NULL if its length is 0 */

    UTrie2 trie;                /* trie that maps a code point to its 16-bit properties word */
    uint8_t formatVersion[4];   /* copied from UDataInfo.formatVersion by isAcceptable() */
};
     41 
     42 /* data loading etc. -------------------------------------------------------- */
     43 
     44 #if UCASE_HARDCODE_DATA
     45 
     46 /* ucase_props_data.c is machine-generated by gencase --csource */
     47 #include "ucase_props_data.c"
     48 
     49 #else
     50 
     51 static UBool U_CALLCONV
     52 isAcceptable(void *context,
     53              const char *type, const char *name,
     54              const UDataInfo *pInfo) {
     55     if(
     56         pInfo->size>=20 &&
     57         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     58         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     59         pInfo->dataFormat[0]==UCASE_FMT_0 &&    /* dataFormat="cAsE" */
     60         pInfo->dataFormat[1]==UCASE_FMT_1 &&
     61         pInfo->dataFormat[2]==UCASE_FMT_2 &&
     62         pInfo->dataFormat[3]==UCASE_FMT_3 &&
     63         pInfo->formatVersion[0]==1 &&
     64         pInfo->formatVersion[2]==UTRIE_SHIFT &&
     65         pInfo->formatVersion[3]==UTRIE_INDEX_SHIFT
     66     ) {
     67         UCaseProps *csp=(UCaseProps *)context;
     68         uprv_memcpy(csp->formatVersion, pInfo->formatVersion, 4);
     69         return TRUE;
     70     } else {
     71         return FALSE;
     72     }
     73 }
     74 
     75 static UCaseProps *
     76 ucase_openData(UCaseProps *cspProto,
     77                const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
     78     UCaseProps *csp;
     79     int32_t size;
     80 
     81     cspProto->indexes=(const int32_t *)bin;
     82     if( (length>=0 && length<16*4) ||
     83         cspProto->indexes[UCASE_IX_INDEX_TOP]<16
     84     ) {
     85         /* length or indexes[] too short for minimum indexes[] length of 16 */
     86         *pErrorCode=U_INVALID_FORMAT_ERROR;
     87         return NULL;
     88     }
     89     size=cspProto->indexes[UCASE_IX_INDEX_TOP]*4;
     90     if(length>=0) {
     91         if(length>=size && length>=cspProto->indexes[UCASE_IX_LENGTH]) {
     92             length-=size;
     93         } else {
     94             /* length too short for indexes[] or for the whole data length */
     95             *pErrorCode=U_INVALID_FORMAT_ERROR;
     96             return NULL;
     97         }
     98     }
     99     bin+=size;
    100     /* from here on, assume that the sizes of the items fit into the total length */
    101 
    102     /* unserialize the trie, after indexes[] */
    103     size=cspProto->indexes[UCASE_IX_TRIE_SIZE];
    104     utrie_unserialize(&cspProto->trie, bin, size, pErrorCode);
    105     if(U_FAILURE(*pErrorCode)) {
    106         return NULL;
    107     }
    108     bin+=size;
    109 
    110     /* get exceptions[] */
    111     size=2*cspProto->indexes[UCASE_IX_EXC_LENGTH];
    112     cspProto->exceptions=(const uint16_t *)bin;
    113     bin+=size;
    114 
    115     /* get unfold[] */
    116     size=2*cspProto->indexes[UCASE_IX_UNFOLD_LENGTH];
    117     if(size!=0) {
    118         cspProto->unfold=(const UChar *)bin;
    119         bin+=size;
    120     } else {
    121         cspProto->unfold=NULL;
    122     }
    123 
    124     /* allocate, copy, and return the new UCaseProps */
    125     csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps));
    126     if(csp==NULL) {
    127         *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    128         return NULL;
    129     } else {
    130         uprv_memcpy(csp, cspProto, sizeof(UCaseProps));
    131         return csp;
    132     }
    133 }
    134 
    135 U_CAPI UCaseProps * U_EXPORT2
    136 ucase_open(UErrorCode *pErrorCode) {
    137     UCaseProps cspProto={ NULL }, *csp;
    138 
    139     cspProto.mem=udata_openChoice(NULL, UCASE_DATA_TYPE, UCASE_DATA_NAME, isAcceptable, &cspProto, pErrorCode);
    140     if(U_FAILURE(*pErrorCode)) {
    141         return NULL;
    142     }
    143 
    144     csp=ucase_openData(
    145             &cspProto,
    146             udata_getMemory(cspProto.mem),
    147             udata_getLength(cspProto.mem),
    148             pErrorCode);
    149     if(U_FAILURE(*pErrorCode)) {
    150         udata_close(cspProto.mem);
    151         return NULL;
    152     } else {
    153         return csp;
    154     }
    155 }
    156 
    157 U_CAPI UCaseProps * U_EXPORT2
    158 ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode) {
    159     UCaseProps cspProto={ NULL };
    160     const DataHeader *hdr;
    161 
    162     if(U_FAILURE(*pErrorCode)) {
    163         return NULL;
    164     }
    165     if(bin==NULL) {
    166         *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
    167         return NULL;
    168     }
    169 
    170     /* check the header */
    171     if(length>=0 && length<20) {
    172         *pErrorCode=U_INVALID_FORMAT_ERROR;
    173         return NULL;
    174     }
    175     hdr=(const DataHeader *)bin;
    176     if(
    177         !(hdr->dataHeader.magic1==0xda && hdr->dataHeader.magic2==0x27 &&
    178           hdr->info.isBigEndian==U_IS_BIG_ENDIAN &&
    179           isAcceptable(&cspProto, UCASE_DATA_TYPE, UCASE_DATA_NAME, &hdr->info))
    180     ) {
    181         *pErrorCode=U_INVALID_FORMAT_ERROR;
    182         return NULL;
    183     }
    184 
    185     bin+=hdr->dataHeader.headerSize;
    186     if(length>=0) {
    187         length-=hdr->dataHeader.headerSize;
    188     }
    189     return ucase_openData(&cspProto, bin, length, pErrorCode);
    190 }
    191 
    192 #endif
    193 
    194 U_CAPI void U_EXPORT2
    195 ucase_close(UCaseProps *csp) {
    196     if(csp!=NULL) {
    197 #if !UCASE_HARDCODE_DATA
    198         udata_close(csp->mem);
    199 #endif
    200         uprv_free(csp);
    201     }
    202 }
    203 
    204 /* UCaseProps singleton ----------------------------------------------------- */
    205 
    206 #if !UCASE_HARDCODE_DATA
    207 static UCaseProps *gCsp=NULL;
    208 static UCaseProps *gCspDummy=NULL;
    209 static UErrorCode gErrorCode=U_ZERO_ERROR;
    210 static int8_t gHaveData=0;
    211 #endif
    212 
    213 #if !UCASE_HARDCODE_DATA
    214 static UBool U_CALLCONV ucase_cleanup(void) {
    215     ucase_close(gCsp);
    216     gCsp=NULL;
    217     ucase_close(gCspDummy);
    218     gCspDummy=NULL;
    219     gErrorCode=U_ZERO_ERROR;
    220     gHaveData=0;
    221     return TRUE;
    222 }
    223 #endif
    224 
    225 U_CAPI const UCaseProps * U_EXPORT2
    226 ucase_getSingleton(UErrorCode *pErrorCode) {
    227 #if UCASE_HARDCODE_DATA
    228     if(U_FAILURE(*pErrorCode)) {
    229         return NULL;
    230     }
    231     return &ucase_props_singleton;
    232 #else
    233     int8_t haveData;
    234 
    235     if(U_FAILURE(*pErrorCode)) {
    236         return NULL;
    237     }
    238 
    239     UMTX_CHECK(NULL, gHaveData, haveData);
    240 
    241     if(haveData>0) {
    242         /* data was loaded */
    243         return gCsp;
    244     } else if(haveData<0) {
    245         /* data loading failed */
    246         *pErrorCode=gErrorCode;
    247         return NULL;
    248     } else /* haveData==0 */ {
    249         /* load the data */
    250         UCaseProps *csp=ucase_open(pErrorCode);
    251         if(U_FAILURE(*pErrorCode)) {
    252             gHaveData=-1;
    253             gErrorCode=*pErrorCode;
    254             return NULL;
    255         }
    256 
    257         /* set the static variables */
    258         umtx_lock(NULL);
    259         if(gCsp==NULL) {
    260             gCsp=csp;
    261             csp=NULL;
    262             gHaveData=1;
    263             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
    264         }
    265         umtx_unlock(NULL);
    266 
    267         ucase_close(csp);
    268         return gCsp;
    269     }
    270 #endif
    271 }
    272 
    273 #if !UCASE_HARDCODE_DATA
    274 U_CAPI const UCaseProps * U_EXPORT2
    275 ucase_getDummy(UErrorCode *pErrorCode) {
    276     UCaseProps *csp;
    277 
    278     if(U_FAILURE(*pErrorCode)) {
    279         return NULL;
    280     }
    281 
    282     UMTX_CHECK(NULL, gCspDummy, csp);
    283 
    284     if(csp!=NULL) {
    285         /* the dummy object was already created */
    286         return csp;
    287     } else /* csp==NULL */ {
    288         /* create the dummy object */
    289         int32_t *indexes;
    290 
    291         csp=(UCaseProps *)uprv_malloc(sizeof(UCaseProps)+UCASE_IX_TOP*4+UTRIE_DUMMY_SIZE);
    292         if(csp==NULL) {
    293             *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
    294             return NULL;
    295         }
    296         uprv_memset(csp, 0, sizeof(UCaseProps)+UCASE_IX_TOP*4);
    297 
    298         csp->indexes=indexes=(int32_t *)(csp+1);
    299         indexes[UCASE_IX_INDEX_TOP]=UCASE_IX_TOP;
    300 
    301         indexes[UCASE_IX_TRIE_SIZE]=
    302             utrie_unserializeDummy(&csp->trie, indexes+UCASE_IX_TOP, UTRIE_DUMMY_SIZE, 0, 0, TRUE, pErrorCode);
    303         if(U_FAILURE(*pErrorCode)) {
    304             uprv_free(csp);
    305             return NULL;
    306         }
    307 
    308         csp->formatVersion[0]=1;
    309         csp->formatVersion[2]=UTRIE_SHIFT;
    310         csp->formatVersion[3]=UTRIE_INDEX_SHIFT;
    311 
    312         /* set the static variables */
    313         umtx_lock(NULL);
    314         if(gCspDummy==NULL) {
    315             gCspDummy=csp;
    316             csp=NULL;
    317             ucln_common_registerCleanup(UCLN_COMMON_UCASE, ucase_cleanup);
    318         }
    319         umtx_unlock(NULL);
    320 
    321         uprv_free(csp);
    322         return gCspDummy;
    323     }
    324 }
    325 #endif
    326 
    327 /* set of property starts for UnicodeSet ------------------------------------ */
    328 
    329 static UBool U_CALLCONV
    330 _enumPropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
    331     /* add the start code point to the USet */
    332     const USetAdder *sa=(const USetAdder *)context;
    333     sa->add(sa->set, start);
    334     return TRUE;
    335 }
    336 
    337 U_CFUNC void U_EXPORT2
    338 ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode) {
    339     if(U_FAILURE(*pErrorCode)) {
    340         return;
    341     }
    342 
    343     /* add the start code point of each same-value range of the trie */
    344     utrie2_enum(&csp->trie, NULL, _enumPropertyStartsRange, sa);
    345 
    346     /* add code points with hardcoded properties, plus the ones following them */
    347 
    348     /* (none right now, see comment below) */
    349 
    350     /*
    351      * Omit code points with hardcoded specialcasing properties
    352      * because we do not build property UnicodeSets for them right now.
    353      */
    354 }
    355 
    356 /* data access primitives --------------------------------------------------- */
    357 
    358 #define GET_EXCEPTIONS(csp, props) ((csp)->exceptions+((props)>>UCASE_EXC_SHIFT))
    359 
    360 #define PROPS_HAS_EXCEPTION(props) ((props)&UCASE_EXCEPTION)
    361 
    362 /* number of bits in an 8-bit integer value */
    363 static const uint8_t flagsOffset[256]={
    364     0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
    365     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    366     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    367     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    368     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    369     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    370     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    371     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    372     1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    373     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    374     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    375     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    376     2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    377     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    378     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    379     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
    380 };
    381 
    382 #define HAS_SLOT(flags, idx) ((flags)&(1<<(idx)))
    383 #define SLOT_OFFSET(flags, idx) flagsOffset[(flags)&((1<<(idx))-1)]
    384 
    385 /*
    386  * Get the value of an optional-value slot where HAS_SLOT(excWord, idx).
    387  *
    388  * @param excWord (in) initial exceptions word
    389  * @param idx (in) desired slot index
    390  * @param pExc16 (in/out) const uint16_t * after excWord=*pExc16++;
    391  *               moved to the last uint16_t of the value, use +1 for beginning of next slot
    392  * @param value (out) int32_t or uint32_t output if hasSlot, otherwise not modified
    393  */
    394 #define GET_SLOT_VALUE(excWord, idx, pExc16, value) \
    395     if(((excWord)&UCASE_EXC_DOUBLE_SLOTS)==0) { \
    396         (pExc16)+=SLOT_OFFSET(excWord, idx); \
    397         (value)=*pExc16; \
    398     } else { \
    399         (pExc16)+=2*SLOT_OFFSET(excWord, idx); \
    400         (value)=*pExc16++; \
    401         (value)=((value)<<16)|*pExc16; \
    402     }
    403 
    404 /* simple case mappings ----------------------------------------------------- */
    405 
    406 U_CAPI UChar32 U_EXPORT2
    407 ucase_tolower(const UCaseProps *csp, UChar32 c) {
    408     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    409     if(!PROPS_HAS_EXCEPTION(props)) {
    410         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
    411             c+=UCASE_GET_DELTA(props);
    412         }
    413     } else {
    414         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    415         uint16_t excWord=*pe++;
    416         if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
    417             GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe, c);
    418         }
    419     }
    420     return c;
    421 }
    422 
    423 U_CAPI UChar32 U_EXPORT2
    424 ucase_toupper(const UCaseProps *csp, UChar32 c) {
    425     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    426     if(!PROPS_HAS_EXCEPTION(props)) {
    427         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    428             c+=UCASE_GET_DELTA(props);
    429         }
    430     } else {
    431         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    432         uint16_t excWord=*pe++;
    433         if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    434             GET_SLOT_VALUE(excWord, UCASE_EXC_UPPER, pe, c);
    435         }
    436     }
    437     return c;
    438 }
    439 
    440 U_CAPI UChar32 U_EXPORT2
    441 ucase_totitle(const UCaseProps *csp, UChar32 c) {
    442     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    443     if(!PROPS_HAS_EXCEPTION(props)) {
    444         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
    445             c+=UCASE_GET_DELTA(props);
    446         }
    447     } else {
    448         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    449         uint16_t excWord=*pe++;
    450         int32_t idx;
    451         if(HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
    452             idx=UCASE_EXC_TITLE;
    453         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
    454             idx=UCASE_EXC_UPPER;
    455         } else {
    456             return c;
    457         }
    458         GET_SLOT_VALUE(excWord, idx, pe, c);
    459     }
    460     return c;
    461 }
    462 
    463 static const UChar iDot[2] = { 0x69, 0x307 };
    464 static const UChar jDot[2] = { 0x6a, 0x307 };
    465 static const UChar iOgonekDot[3] = { 0x12f, 0x307 };
    466 static const UChar iDotGrave[3] = { 0x69, 0x307, 0x300 };
    467 static const UChar iDotAcute[3] = { 0x69, 0x307, 0x301 };
    468 static const UChar iDotTilde[3] = { 0x69, 0x307, 0x303 };
    469 
    470 
    471 U_CFUNC void U_EXPORT2
    472 ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa) {
    473     uint16_t props;
    474 
    475     /*
    476      * Hardcode the case closure of i and its relatives and ignore the
    477      * data file data for these characters.
    478      * The Turkic dotless i and dotted I with their case mapping conditions
    479      * and case folding option make the related characters behave specially.
    480      * This code matches their closure behavior to their case folding behavior.
    481      */
    482 
    483     switch(c) {
    484     case 0x49:
    485         /* regular i and I are in one equivalence class */
    486         sa->add(sa->set, 0x69);
    487         return;
    488     case 0x69:
    489         sa->add(sa->set, 0x49);
    490         return;
    491     case 0x130:
    492         /* dotted I is in a class with <0069 0307> (for canonical equivalence with <0049 0307>) */
    493         sa->addString(sa->set, iDot, 2);
    494         return;
    495     case 0x131:
    496         /* dotless i is in a class by itself */
    497         return;
    498     default:
    499         /* otherwise use the data file data */
    500         break;
    501     }
    502 
    503     props=UTRIE2_GET16(&csp->trie, c);
    504     if(!PROPS_HAS_EXCEPTION(props)) {
    505         if(UCASE_GET_TYPE(props)!=UCASE_NONE) {
    506             /* add the one simple case mapping, no matter what type it is */
    507             int32_t delta=UCASE_GET_DELTA(props);
    508             if(delta!=0) {
    509                 sa->add(sa->set, c+delta);
    510             }
    511         }
    512     } else {
    513         /*
    514          * c has exceptions, so there may be multiple simple and/or
    515          * full case mappings. Add them all.
    516          */
    517         const uint16_t *pe0, *pe=GET_EXCEPTIONS(csp, props);
    518         const UChar *closure;
    519         uint16_t excWord=*pe++;
    520         int32_t idx, closureLength, fullLength, length;
    521 
    522         pe0=pe;
    523 
    524         /* add all simple case mappings */
    525         for(idx=UCASE_EXC_LOWER; idx<=UCASE_EXC_TITLE; ++idx) {
    526             if(HAS_SLOT(excWord, idx)) {
    527                 pe=pe0;
    528                 GET_SLOT_VALUE(excWord, idx, pe, c);
    529                 sa->add(sa->set, c);
    530             }
    531         }
    532 
    533         /* get the closure string pointer & length */
    534         if(HAS_SLOT(excWord, UCASE_EXC_CLOSURE)) {
    535             pe=pe0;
    536             GET_SLOT_VALUE(excWord, UCASE_EXC_CLOSURE, pe, closureLength);
    537             closureLength&=UCASE_CLOSURE_MAX_LENGTH; /* higher bits are reserved */
    538             closure=(const UChar *)pe+1; /* behind this slot, unless there are full case mappings */
    539         } else {
    540             closureLength=0;
    541             closure=NULL;
    542         }
    543 
    544         /* add the full case folding */
    545         if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
    546             pe=pe0;
    547             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, fullLength);
    548 
    549             /* start of full case mapping strings */
    550             ++pe;
    551 
    552             fullLength&=0xffff; /* bits 16 and higher are reserved */
    553 
    554             /* skip the lowercase result string */
    555             pe+=fullLength&UCASE_FULL_LOWER;
    556             fullLength>>=4;
    557 
    558             /* add the full case folding string */
    559             length=fullLength&0xf;
    560             if(length!=0) {
    561                 sa->addString(sa->set, (const UChar *)pe, length);
    562                 pe+=length;
    563             }
    564 
    565             /* skip the uppercase and titlecase strings */
    566             fullLength>>=4;
    567             pe+=fullLength&0xf;
    568             fullLength>>=4;
    569             pe+=fullLength;
    570 
    571             closure=(const UChar *)pe; /* behind full case mappings */
    572         }
    573 
    574         /* add each code point in the closure string */
    575         for(idx=0; idx<closureLength;) {
    576             U16_NEXT_UNSAFE(closure, idx, c);
    577             sa->add(sa->set, c);
    578         }
    579     }
    580 }
    581 
    582 /*
    583  * compare s, which has a length, with t, which has a maximum length or is NUL-terminated
    584  * must be length>0 and max>0 and length<=max
    585  */
    586 static U_INLINE int32_t
    587 strcmpMax(const UChar *s, int32_t length, const UChar *t, int32_t max) {
    588     int32_t c1, c2;
    589 
    590     max-=length; /* we require length<=max, so no need to decrement max in the loop */
    591     do {
    592         c1=*s++;
    593         c2=*t++;
    594         if(c2==0) {
    595             return 1; /* reached the end of t but not of s */
    596         }
    597         c1-=c2;
    598         if(c1!=0) {
    599             return c1; /* return difference result */
    600         }
    601     } while(--length>0);
    602     /* ends with length==0 */
    603 
    604     if(max==0 || *t==0) {
    605         return 0; /* equal to length of both strings */
    606     } else {
    607         return -max; /* return lengh difference */
    608     }
    609 }
    610 
    611 U_CFUNC UBool U_EXPORT2
    612 ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa) {
    613     const UChar *unfold, *p;
    614     int32_t i, start, limit, result, unfoldRows, unfoldRowWidth, unfoldStringWidth;
    615 
    616     if(csp->unfold==NULL || s==NULL) {
    617         return FALSE; /* no reverse case folding data, or no string */
    618     }
    619     if(length<=1) {
    620         /* the string is too short to find any match */
    621         /*
    622          * more precise would be:
    623          * if(!u_strHasMoreChar32Than(s, length, 1))
    624          * but this does not make much practical difference because
    625          * a single supplementary code point would just not be found
    626          */
    627         return FALSE;
    628     }
    629 
    630     unfold=csp->unfold;
    631     unfoldRows=unfold[UCASE_UNFOLD_ROWS];
    632     unfoldRowWidth=unfold[UCASE_UNFOLD_ROW_WIDTH];
    633     unfoldStringWidth=unfold[UCASE_UNFOLD_STRING_WIDTH];
    634     unfold+=unfoldRowWidth;
    635 
    636     if(length>unfoldStringWidth) {
    637         /* the string is too long to find any match */
    638         return FALSE;
    639     }
    640 
    641     /* do a binary search for the string */
    642     start=0;
    643     limit=unfoldRows;
    644     while(start<limit) {
    645         i=(start+limit)/2;
    646         p=unfold+(i*unfoldRowWidth);
    647         result=strcmpMax(s, length, p, unfoldStringWidth);
    648 
    649         if(result==0) {
    650             /* found the string: add each code point, and its case closure */
    651             UChar32 c;
    652 
    653             for(i=unfoldStringWidth; i<unfoldRowWidth && p[i]!=0;) {
    654                 U16_NEXT_UNSAFE(p, i, c);
    655                 sa->add(sa->set, c);
    656                 ucase_addCaseClosure(csp, c, sa);
    657             }
    658             return TRUE;
    659         } else if(result<0) {
    660             limit=i;
    661         } else /* result>0 */ {
    662             start=i+1;
    663         }
    664     }
    665 
    666     return FALSE; /* string not found */
    667 }
    668 
    669 /** @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE */
    670 U_CAPI int32_t U_EXPORT2
    671 ucase_getType(const UCaseProps *csp, UChar32 c) {
    672     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    673     return UCASE_GET_TYPE(props);
    674 }
    675 
    676 /** @return same as ucase_getType() and set bit 2 if c is case-ignorable */
    677 U_CAPI int32_t U_EXPORT2
    678 ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c) {
    679     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    680     int32_t type=UCASE_GET_TYPE(props);
    681     if(props&UCASE_EXCEPTION) {
    682         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    683         if(*pe&UCASE_EXC_CASE_IGNORABLE) {
    684             type|=4;
    685         }
    686     } else if(type==UCASE_NONE && (props&UCASE_CASE_IGNORABLE)) {
    687         type|=4;
    688     }
    689     return type;
    690 }
    691 
    692 /** @return UCASE_NO_DOT, UCASE_SOFT_DOTTED, UCASE_ABOVE, UCASE_OTHER_ACCENT */
    693 static U_INLINE int32_t
    694 getDotType(const UCaseProps *csp, UChar32 c) {
    695     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    696     if(!PROPS_HAS_EXCEPTION(props)) {
    697         return props&UCASE_DOT_MASK;
    698     } else {
    699         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
    700         return (*pe>>UCASE_EXC_DOT_SHIFT)&UCASE_DOT_MASK;
    701     }
    702 }
    703 
    704 U_CAPI UBool U_EXPORT2
    705 ucase_isSoftDotted(const UCaseProps *csp, UChar32 c) {
    706     return (UBool)(getDotType(csp, c)==UCASE_SOFT_DOTTED);
    707 }
    708 
    709 U_CAPI UBool U_EXPORT2
    710 ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c) {
    711     uint16_t props=UTRIE2_GET16(&csp->trie, c);
    712     return (UBool)((props&UCASE_SENSITIVE)!=0);
    713 }
    714 
    715 /* string casing ------------------------------------------------------------ */
    716 
    717 /*
    718  * These internal functions form the core of string case mappings.
    719  * They map single code points to result code points or strings and take
    720  * all necessary conditions (context, locale ID, options) into account.
    721  *
    722  * They do not iterate over the source or write to the destination
    723  * so that the same functions are useful for non-standard string storage,
    724  * such as in a Replaceable (for Transliterator) or UTF-8/32 strings etc.
    725  * For the same reason, the "surrounding text" context is passed in as a
    726  * UCaseContextIterator which does not make any assumptions about
    727  * the underlying storage.
    728  *
    729  * This section contains helper functions that check for conditions
    730  * in the input text surrounding the current code point
    731  * according to SpecialCasing.txt.
    732  *
    733  * Each helper function gets the index
    734  * - after the current code point if it looks at following text
    735  * - before the current code point if it looks at preceding text
    736  *
    737  * Unicode 3.2 UAX 21 "Case Mappings" defines the conditions as follows:
    738  *
    739  * Final_Sigma
    740  *   C is preceded by a sequence consisting of
    741  *     a cased letter and a case-ignorable sequence,
    742  *   and C is not followed by a sequence consisting of
    743  *     an ignorable sequence and then a cased letter.
    744  *
    745  * More_Above
    746  *   C is followed by one or more characters of combining class 230 (ABOVE)
    747  *   in the combining character sequence.
    748  *
    749  * After_Soft_Dotted
    750  *   The last preceding character with combining class of zero before C
    751  *   was Soft_Dotted,
    752  *   and there is no intervening combining character class 230 (ABOVE).
    753  *
    754  * Before_Dot
    755  *   C is followed by combining dot above (U+0307).
    756  *   Any sequence of characters with a combining class that is neither 0 nor 230
    757  *   may intervene between the current character and the combining dot above.
    758  *
    759  * The erratum from 2002-10-31 adds the condition
    760  *
    761  * After_I
    762  *   The last preceding base character was an uppercase I, and there is no
    763  *   intervening combining character class 230 (ABOVE).
    764  *
    765  *   (See Jitterbug 2344 and the comments on After_I below.)
    766  *
    767  * Helper definitions in Unicode 3.2 UAX 21:
    768  *
    769  * D1. A character C is defined to be cased
    770  *     if it meets any of the following criteria:
    771  *
    772  *   - The general category of C is Titlecase Letter (Lt)
    773  *   - In [CoreProps], C has one of the properties Uppercase, or Lowercase
    774  *   - Given D = NFD(C), then it is not the case that:
    775  *     D = UCD_lower(D) = UCD_upper(D) = UCD_title(D)
    776  *     (This third criterium does not add any characters to the list
    777  *      for Unicode 3.2. Ignored.)
    778  *
    779  * D2. A character C is defined to be case-ignorable
    780  *     if it meets either of the following criteria:
    781  *
    782  *   - The general category of C is
    783  *     Nonspacing Mark (Mn), or Enclosing Mark (Me), or Format Control (Cf), or
    784  *     Letter Modifier (Lm), or Symbol Modifier (Sk)
    785  *   - C is one of the following characters
    786  *     U+0027 APOSTROPHE
    787  *     U+00AD SOFT HYPHEN (SHY)
    788  *     U+2019 RIGHT SINGLE QUOTATION MARK
    789  *            (the preferred character for apostrophe)
    790  *
    791  * D3. A case-ignorable sequence is a sequence of
    792  *     zero or more case-ignorable characters.
    793  */
    794 
    795 #define is_a(c) ((c)=='a' || (c)=='A')
    796 #define is_d(c) ((c)=='d' || (c)=='D')
    797 #define is_e(c) ((c)=='e' || (c)=='E')
    798 #define is_i(c) ((c)=='i' || (c)=='I')
    799 #define is_l(c) ((c)=='l' || (c)=='L')
    800 #define is_n(c) ((c)=='n' || (c)=='N')
    801 #define is_r(c) ((c)=='r' || (c)=='R')
    802 #define is_t(c) ((c)=='t' || (c)=='T')
    803 #define is_u(c) ((c)=='u' || (c)=='U')
    804 #define is_z(c) ((c)=='z' || (c)=='Z')
    805 
    806 /* separator? */
    807 #define is_sep(c) ((c)=='_' || (c)=='-' || (c)==0)
    808 
    809 /**
    810  * Requires non-NULL locale ID but otherwise does the equivalent of
    811  * checking for language codes as if uloc_getLanguage() were called:
    812  * Accepts both 2- and 3-letter codes and accepts case variants.
    813  */
    814 U_CFUNC int32_t
    815 ucase_getCaseLocale(const char *locale, int32_t *locCache) {
    816     int32_t result;
    817     char c;
    818 
    819     if(locCache!=NULL && (result=*locCache)!=UCASE_LOC_UNKNOWN) {
    820         return result;
    821     }
    822 
    823     result=UCASE_LOC_ROOT;
    824 
    825     /*
    826      * This function used to use uloc_getLanguage(), but the current code
    827      * removes the dependency of this low-level code on uloc implementation code
    828      * and is faster because not the whole locale ID has to be
    829      * examined and copied/transformed.
    830      *
    831      * Because this code does not want to depend on uloc, the caller must
    832      * pass in a non-NULL locale, i.e., may need to call uloc_getDefault().
    833      */
    834     c=*locale++;
    835     if(is_t(c)) {
    836         /* tr or tur? */
    837         c=*locale++;
    838         if(is_u(c)) {
    839             c=*locale++;
    840         }
    841         if(is_r(c)) {
    842             c=*locale;
    843             if(is_sep(c)) {
    844                 result=UCASE_LOC_TURKISH;
    845             }
    846         }
    847     } else if(is_a(c)) {
    848         /* az or aze? */
    849         c=*locale++;
    850         if(is_z(c)) {
    851             c=*locale++;
    852             if(is_e(c)) {
    853                 c=*locale;
    854             }
    855             if(is_sep(c)) {
    856                 result=UCASE_LOC_TURKISH;
    857             }
    858         }
    859     } else if(is_l(c)) {
    860         /* lt or lit? */
    861         c=*locale++;
    862         if(is_i(c)) {
    863             c=*locale++;
    864         }
    865         if(is_t(c)) {
    866             c=*locale;
    867             if(is_sep(c)) {
    868                 result=UCASE_LOC_LITHUANIAN;
    869             }
    870         }
    871     } else if(is_n(c)) {
    872         /* nl or nld? */
    873         c=*locale++;
    874         if(is_l(c)) {
    875             c=*locale++;
    876             if(is_d(c)) {
    877                 c=*locale;
    878             }
    879             if(is_sep(c)) {
    880                 result=UCASE_LOC_DUTCH;
    881             }
    882         }
    883     }
    884 
    885     if(locCache!=NULL) {
    886         *locCache=result;
    887     }
    888     return result;
    889 }
    890 
    891 /*
    892  * Is followed by
    893  *   {case-ignorable}* cased
    894  * ?
    895  * (dir determines looking forward/backward)
    896  * If a character is case-ignorable, it is skipped regardless of whether
    897  * it is also cased or not.
    898  */
    899 static UBool
    900 isFollowedByCasedLetter(const UCaseProps *csp, UCaseContextIterator *iter, void *context, int8_t dir) {
    901     UChar32 c;
    902 
    903     if(iter==NULL) {
    904         return FALSE;
    905     }
    906 
    907     for(/* dir!=0 sets direction */; (c=iter(context, dir))>=0; dir=0) {
    908         int32_t type=ucase_getTypeOrIgnorable(csp, c);
    909         if(type&4) {
    910             /* case-ignorable, continue with the loop */
    911         } else if(type!=UCASE_NONE) {
    912             return TRUE; /* followed by cased letter */
    913         } else {
    914             return FALSE; /* uncased and not case-ignorable */
    915         }
    916     }
    917 
    918     return FALSE; /* not followed by cased letter */
    919 }
    920 
    921 /* Is preceded by Soft_Dotted character with no intervening cc=230 ? */
    922 static UBool
    923 isPrecededBySoftDotted(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    924     UChar32 c;
    925     int32_t dotType;
    926     int8_t dir;
    927 
    928     if(iter==NULL) {
    929         return FALSE;
    930     }
    931 
    932     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    933         dotType=getDotType(csp, c);
    934         if(dotType==UCASE_SOFT_DOTTED) {
    935             return TRUE; /* preceded by TYPE_i */
    936         } else if(dotType!=UCASE_OTHER_ACCENT) {
    937             return FALSE; /* preceded by different base character (not TYPE_i), or intervening cc==230 */
    938         }
    939     }
    940 
    941     return FALSE; /* not preceded by TYPE_i */
    942 }
    943 
    944 /*
    945  * See Jitterbug 2344:
    946  * The condition After_I for Turkic-lowercasing of U+0307 combining dot above
    947  * is checked in ICU 2.0, 2.1, 2.6 but was not in 2.2 & 2.4 because
    948  * we made those releases compatible with Unicode 3.2 which had not fixed
    949  * a related bug in SpecialCasing.txt.
    950  *
    951  * From the Jitterbug 2344 text:
    952  * ... this bug is listed as a Unicode erratum
    953  * from 2002-10-31 at http://www.unicode.org/uni2errata/UnicodeErrata.html
    954  * <quote>
    955  * There are two errors in SpecialCasing.txt.
    956  * 1. Missing semicolons on two lines. ... [irrelevant for ICU]
    957  * 2. An incorrect context definition. Correct as follows:
    958  * < 0307; ; 0307; 0307; tr After_Soft_Dotted; # COMBINING DOT ABOVE
    959  * < 0307; ; 0307; 0307; az After_Soft_Dotted; # COMBINING DOT ABOVE
    960  * ---
    961  * > 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
    962  * > 0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
    963  * where the context After_I is defined as:
    964  * The last preceding base character was an uppercase I, and there is no
    965  * intervening combining character class 230 (ABOVE).
    966  * </quote>
    967  *
    968  * Note that SpecialCasing.txt even in Unicode 3.2 described the condition as:
    969  *
    970  * # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
    971  * # This matches the behavior of the canonically equivalent I-dot_above
    972  *
    973  * See also the description in this place in older versions of uchar.c (revision 1.100).
    974  *
    975  * Markus W. Scherer 2003-feb-15
    976  */
    977 
    978 /* Is preceded by base character 'I' with no intervening cc=230 ? */
    979 static UBool
    980 isPrecededBy_I(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
    981     UChar32 c;
    982     int32_t dotType;
    983     int8_t dir;
    984 
    985     if(iter==NULL) {
    986         return FALSE;
    987     }
    988 
    989     for(dir=-1; (c=iter(context, dir))>=0; dir=0) {
    990         if(c==0x49) {
    991             return TRUE; /* preceded by I */
    992         }
    993         dotType=getDotType(csp, c);
    994         if(dotType!=UCASE_OTHER_ACCENT) {
    995             return FALSE; /* preceded by different base character (not I), or intervening cc==230 */
    996         }
    997     }
    998 
    999     return FALSE; /* not preceded by I */
   1000 }
   1001 
   1002 /* Is followed by one or more cc==230 ? */
   1003 static UBool
   1004 isFollowedByMoreAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   1005     UChar32 c;
   1006     int32_t dotType;
   1007     int8_t dir;
   1008 
   1009     if(iter==NULL) {
   1010         return FALSE;
   1011     }
   1012 
   1013     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   1014         dotType=getDotType(csp, c);
   1015         if(dotType==UCASE_ABOVE) {
   1016             return TRUE; /* at least one cc==230 following */
   1017         } else if(dotType!=UCASE_OTHER_ACCENT) {
   1018             return FALSE; /* next base character, no more cc==230 following */
   1019         }
   1020     }
   1021 
   1022     return FALSE; /* no more cc==230 following */
   1023 }
   1024 
   1025 /* Is followed by a dot above (without cc==230 in between) ? */
   1026 static UBool
   1027 isFollowedByDotAbove(const UCaseProps *csp, UCaseContextIterator *iter, void *context) {
   1028     UChar32 c;
   1029     int32_t dotType;
   1030     int8_t dir;
   1031 
   1032     if(iter==NULL) {
   1033         return FALSE;
   1034     }
   1035 
   1036     for(dir=1; (c=iter(context, dir))>=0; dir=0) {
   1037         if(c==0x307) {
   1038             return TRUE;
   1039         }
   1040         dotType=getDotType(csp, c);
   1041         if(dotType!=UCASE_OTHER_ACCENT) {
   1042             return FALSE; /* next base character or cc==230 in between */
   1043         }
   1044     }
   1045 
   1046     return FALSE; /* no dot above following */
   1047 }
   1048 
/*
 * Full lowercase mapping of c, including the conditional
 * (locale- and context-sensitive) special cases that are hardcoded below.
 *
 * @param csp      case properties; its trie supplies the properties word for c
 * @param c        the code point to map
 * @param iter     context iterator, handed to the isFollowedBy.../isPrecededBy...
 *                 helpers; each of those returns FALSE when iter is NULL
 * @param context  opaque pointer that is passed through to iter
 * @param pString  receives a pointer to the result string whenever the
 *                 return value is the length of a string
 * @param locale   locale ID for ucase_getCaseLocale(); only looked at when the
 *                 exception word for c has UCASE_EXC_CONDITIONAL_SPECIAL set
 * @param locCache cache for the case locale, see ucase_getCaseLocale()
 * @return ~c if the mapping result is c itself;
 *         the length of the string that *pString was set to;
 *         0 (with *pString untouched) if c is to be removed from the output;
 *         otherwise the single code point that c maps to.
 *         NOTE(review): how a caller tells a string length from a code point
 *         is not visible in this function - presumably by magnitude; confirm
 *         against the declaration in ucase.h.
 */
U_CAPI int32_t U_EXPORT2
ucase_toFullLower(const UCaseProps *csp, UChar32 c,
                  UCaseContextIterator *iter, void *context,
                  const UChar **pString,
                  const char *locale, int32_t *locCache)
{
    UChar32 result=c;
    uint16_t props=UTRIE2_GET16(&csp->trie, c);
    if(!PROPS_HAS_EXCEPTION(props)) {
        /* simple case: the mapping is a delta stored in the properties word */
        if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
            result=c+UCASE_GET_DELTA(props);
        }
    } else {
        const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
        uint16_t excWord=*pe++;
        int32_t full;

        /*
         * Keep a second pointer to the first slot: pe is used for the
         * full-mappings slot, pe2 for the simple lowercase slot at the end.
         * (Presumably GET_SLOT_VALUE moves the pointer it is given - confirm
         * against the macro definition.)
         */
        pe2=pe;

        if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
            /* use hardcoded conditions and mappings */
            int32_t loc=ucase_getCaseLocale(locale, locCache);

            /*
             * Test for conditional mappings first
             *   (otherwise the unconditional default mappings are always taken),
             * then test for characters that have unconditional mappings in SpecialCasing.txt,
             * then get the UnicodeData.txt mappings.
             */
            if( loc==UCASE_LOC_LITHUANIAN &&
                    /* base characters, find accents above */
                    (((c==0x49 || c==0x4a || c==0x12e) &&
                        isFollowedByMoreAbove(csp, iter, context)) ||
                    /* precomposed with accent above, no need to find one */
                    (c==0xcc || c==0xcd || c==0x128))
            ) {
                /*
                    # Lithuanian

                    # Lithuanian retains the dot in a lowercase i when followed by accents.

                    # Introduce an explicit dot above when lowercasing capital I's and J's
                    # whenever there are more accents above.
                    # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek)

                    0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I
                    004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J
                    012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK
                    00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE
                    00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE
                    0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE
                 */
                /* each case returns the length of the string that it selects */
                switch(c) {
                case 0x49:  /* LATIN CAPITAL LETTER I */
                    *pString=iDot;
                    return 2;
                case 0x4a:  /* LATIN CAPITAL LETTER J */
                    *pString=jDot;
                    return 2;
                case 0x12e: /* LATIN CAPITAL LETTER I WITH OGONEK */
                    *pString=iOgonekDot;
                    return 2;
                case 0xcc:  /* LATIN CAPITAL LETTER I WITH GRAVE */
                    *pString=iDotGrave;
                    return 3;
                case 0xcd:  /* LATIN CAPITAL LETTER I WITH ACUTE */
                    *pString=iDotAcute;
                    return 3;
                case 0x128: /* LATIN CAPITAL LETTER I WITH TILDE */
                    *pString=iDotTilde;
                    return 3;
                default:
                    /* the condition above admits only the six code points handled here */
                    return 0; /* will not occur */
                }
            /* # Turkish and Azeri */
            } else if(loc==UCASE_LOC_TURKISH && c==0x130) {
                /*
                    # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
                    # The following rules handle those cases.

                    0130; 0069; 0130; 0130; tr # LATIN CAPITAL LETTER I WITH DOT ABOVE
                    0130; 0069; 0130; 0130; az # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                return 0x69;
            } else if(loc==UCASE_LOC_TURKISH && c==0x307 && isPrecededBy_I(csp, iter, context)) {
                /*
                    # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i.
                    # This matches the behavior of the canonically equivalent I-dot_above

                    0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
                    0307; ; 0307; 0307; az After_I; # COMBINING DOT ABOVE
                 */
                return 0; /* remove the dot (continue without output) */
            } else if(loc==UCASE_LOC_TURKISH && c==0x49 && !isFollowedByDotAbove(csp, iter, context)) {
                /*
                    # When lowercasing, unless an I is before a dot_above, it turns into a dotless i.

                    0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I
                    0049; 0131; 0049; 0049; az Not_Before_Dot; # LATIN CAPITAL LETTER I
                 */
                return 0x131;
            } else if(c==0x130) {
                /*
                    # Preserve canonical equivalence for I with dot. Turkic is handled below.

                    0130; 0069 0307; 0130; 0130; # LATIN CAPITAL LETTER I WITH DOT ABOVE
                 */
                /* reached only for non-Turkic locales: the Turkic case returned further up */
                *pString=iDot;
                return 2;
            } else if(  c==0x3a3 &&
                        !isFollowedByCasedLetter(csp, iter, context, 1) &&
                        isFollowedByCasedLetter(csp, iter, context, -1) /* -1=preceded */
            ) {
                /* greek capital sigma maps depending on surrounding cased letters (see SpecialCasing.txt) */
                /*
                    # Special case for final form of sigma

                    03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA
                 */
                return 0x3c2; /* greek small final sigma */
            } else {
                /* no known conditional special case mapping, use a normal mapping */
            }
        } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
            /* keep only the part of the slot value that is the lowercase string length */
            full&=UCASE_FULL_LOWER;
            if(full!=0) {
                /* set the output pointer to the lowercase mapping */
                *pString=pe+1;

                /* return the string length */
                return full;
            }
        }

        /* no string result: fall back to the simple lowercase mapping, if there is one */
        if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
            GET_SLOT_VALUE(excWord, UCASE_EXC_LOWER, pe2, result);
        }
    }

    /* an unchanged code point is reported as its bitwise complement */
    return (result==c) ? ~result : result;
}
   1191 
   1192 /* internal */
   1193 static int32_t
   1194 toUpperOrTitle(const UCaseProps *csp, UChar32 c,
   1195                UCaseContextIterator *iter, void *context,
   1196                const UChar **pString,
   1197                const char *locale, int32_t *locCache,
   1198                UBool upperNotTitle) {
   1199     UChar32 result=c;
   1200     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1201     if(!PROPS_HAS_EXCEPTION(props)) {
   1202         if(UCASE_GET_TYPE(props)==UCASE_LOWER) {
   1203             result=c+UCASE_GET_DELTA(props);
   1204         }
   1205     } else {
   1206         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   1207         uint16_t excWord=*pe++;
   1208         int32_t full, idx;
   1209 
   1210         pe2=pe;
   1211 
   1212         if(excWord&UCASE_EXC_CONDITIONAL_SPECIAL) {
   1213             /* use hardcoded conditions and mappings */
   1214             int32_t loc=ucase_getCaseLocale(locale, locCache);
   1215 
   1216             if(loc==UCASE_LOC_TURKISH && c==0x69) {
   1217                 /*
   1218                     # Turkish and Azeri
   1219 
   1220                     # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri
   1221                     # The following rules handle those cases.
   1222 
   1223                     # When uppercasing, i turns into a dotted capital I
   1224 
   1225                     0069; 0069; 0130; 0130; tr; # LATIN SMALL LETTER I
   1226                     0069; 0069; 0130; 0130; az; # LATIN SMALL LETTER I
   1227                 */
   1228                 return 0x130;
   1229             } else if(loc==UCASE_LOC_LITHUANIAN && c==0x307 && isPrecededBySoftDotted(csp, iter, context)) {
   1230                 /*
   1231                     # Lithuanian
   1232 
   1233                     # Lithuanian retains the dot in a lowercase i when followed by accents.
   1234 
   1235                     # Remove DOT ABOVE after "i" with upper or titlecase
   1236 
   1237                     0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE
   1238                  */
   1239                 return 0; /* remove the dot (continue without output) */
   1240             } else {
   1241                 /* no known conditional special case mapping, use a normal mapping */
   1242             }
   1243         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1244             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1245 
   1246             /* start of full case mapping strings */
   1247             ++pe;
   1248 
   1249             /* skip the lowercase and case-folding result strings */
   1250             pe+=full&UCASE_FULL_LOWER;
   1251             full>>=4;
   1252             pe+=full&0xf;
   1253             full>>=4;
   1254 
   1255             if(upperNotTitle) {
   1256                 full&=0xf;
   1257             } else {
   1258                 /* skip the uppercase result string */
   1259                 pe+=full&0xf;
   1260                 full=(full>>4)&0xf;
   1261             }
   1262 
   1263             if(full!=0) {
   1264                 /* set the output pointer to the result string */
   1265                 *pString=pe;
   1266 
   1267                 /* return the string length */
   1268                 return full;
   1269             }
   1270         }
   1271 
   1272         if(!upperNotTitle && HAS_SLOT(excWord, UCASE_EXC_TITLE)) {
   1273             idx=UCASE_EXC_TITLE;
   1274         } else if(HAS_SLOT(excWord, UCASE_EXC_UPPER)) {
   1275             /* here, titlecase is same as uppercase */
   1276             idx=UCASE_EXC_UPPER;
   1277         } else {
   1278             return ~c;
   1279         }
   1280         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1281     }
   1282 
   1283     return (result==c) ? ~result : result;
   1284 }
   1285 
   1286 U_CAPI int32_t U_EXPORT2
   1287 ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
   1288                   UCaseContextIterator *iter, void *context,
   1289                   const UChar **pString,
   1290                   const char *locale, int32_t *locCache) {
   1291     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, TRUE);
   1292 }
   1293 
   1294 U_CAPI int32_t U_EXPORT2
   1295 ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
   1296                   UCaseContextIterator *iter, void *context,
   1297                   const UChar **pString,
   1298                   const char *locale, int32_t *locCache) {
   1299     return toUpperOrTitle(csp, c, iter, context, pString, locale, locCache, FALSE);
   1300 }
   1301 
   1302 /* case folding ------------------------------------------------------------- */
   1303 
   1304 /*
   1305  * Case folding is similar to lowercasing.
   1306  * The result may be a simple mapping, i.e., a single code point, or
   1307  * a full mapping, i.e., a string.
   1308  * If the case folding for a code point is the same as its simple (1:1) lowercase mapping,
   1309  * then only the lowercase mapping is stored.
   1310  *
   1311  * Some special cases are hardcoded because their conditions cannot be
   1312  * parsed and processed from CaseFolding.txt.
   1313  *
   1314  * Unicode 3.2 CaseFolding.txt specifies for its status field:
   1315 
   1316 # C: common case folding, common mappings shared by both simple and full mappings.
   1317 # F: full case folding, mappings that cause strings to grow in length. Multiple characters are separated by spaces.
   1318 # S: simple case folding, mappings to single characters where different from F.
   1319 # T: special case for uppercase I and dotted uppercase I
   1320 #    - For non-Turkic languages, this mapping is normally not used.
   1321 #    - For Turkic languages (tr, az), this mapping can be used instead of the normal mapping for these characters.
   1322 #
   1323 # Usage:
   1324 #  A. To do a simple case folding, use the mappings with status C + S.
   1325 #  B. To do a full case folding, use the mappings with status C + F.
   1326 #
   1327 #    The mappings with status T can be used or omitted depending on the desired case-folding
   1328 #    behavior. (The default option is to exclude them.)
   1329 
   1330  * Unicode 3.2 has 'T' mappings as follows:
   1331 
   1332 0049; T; 0131; # LATIN CAPITAL LETTER I
   1333 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1334 
   1335  * while the default mappings for these code points are:
   1336 
   1337 0049; C; 0069; # LATIN CAPITAL LETTER I
   1338 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE
   1339 
   1340  * U+0130 has no simple case folding (simple-case-folds to itself).
   1341  */
   1342 
   1343 /* return the simple case folding mapping for c */
   1344 U_CAPI UChar32 U_EXPORT2
   1345 ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options) {
   1346     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1347     if(!PROPS_HAS_EXCEPTION(props)) {
   1348         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1349             c+=UCASE_GET_DELTA(props);
   1350         }
   1351     } else {
   1352         const uint16_t *pe=GET_EXCEPTIONS(csp, props);
   1353         uint16_t excWord=*pe++;
   1354         int32_t idx;
   1355         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1356             /* special case folding mappings, hardcoded */
   1357             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1358                 /* default mappings */
   1359                 if(c==0x49) {
   1360                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1361                     return 0x69;
   1362                 } else if(c==0x130) {
   1363                     /* no simple case folding for U+0130 */
   1364                     return c;
   1365                 }
   1366             } else {
   1367                 /* Turkic mappings */
   1368                 if(c==0x49) {
   1369                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1370                     return 0x131;
   1371                 } else if(c==0x130) {
   1372                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1373                     return 0x69;
   1374                 }
   1375             }
   1376         }
   1377         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1378             idx=UCASE_EXC_FOLD;
   1379         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1380             idx=UCASE_EXC_LOWER;
   1381         } else {
   1382             return c;
   1383         }
   1384         GET_SLOT_VALUE(excWord, idx, pe, c);
   1385     }
   1386     return c;
   1387 }
   1388 
   1389 /*
   1390  * Issue for canonical caseless match (UAX #21):
   1391  * Turkic casefolding (using "T" mappings in CaseFolding.txt) does not preserve
   1392  * canonical equivalence, unlike default-option casefolding.
   1393  * For example, I-grave and I + grave fold to strings that are not canonically
   1394  * equivalent.
   1395  * For more details, see the comment in unorm_compare() in unorm.cpp
   1396  * and the intermediate prototype changes for Jitterbug 2021.
   1397  * (For example, revision 1.104 of uchar.c and 1.4 of CaseFolding.txt.)
   1398  *
   1399  * This did not get fixed because it appears that it is not possible to fix
   1400  * it for uppercase and lowercase characters (I-grave vs. i-grave)
   1401  * together in a way that they still fold to common result strings.
   1402  */
   1403 
   1404 U_CAPI int32_t U_EXPORT2
   1405 ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
   1406                     const UChar **pString,
   1407                     uint32_t options)
   1408 {
   1409     UChar32 result=c;
   1410     uint16_t props=UTRIE2_GET16(&csp->trie, c);
   1411     if(!PROPS_HAS_EXCEPTION(props)) {
   1412         if(UCASE_GET_TYPE(props)>=UCASE_UPPER) {
   1413             result=c+UCASE_GET_DELTA(props);
   1414         }
   1415     } else {
   1416         const uint16_t *pe=GET_EXCEPTIONS(csp, props), *pe2;
   1417         uint16_t excWord=*pe++;
   1418         int32_t full, idx;
   1419 
   1420         pe2=pe;
   1421 
   1422         if(excWord&UCASE_EXC_CONDITIONAL_FOLD) {
   1423             /* use hardcoded conditions and mappings */
   1424             if((options&_FOLD_CASE_OPTIONS_MASK)==U_FOLD_CASE_DEFAULT) {
   1425                 /* default mappings */
   1426                 if(c==0x49) {
   1427                     /* 0049; C; 0069; # LATIN CAPITAL LETTER I */
   1428                     return 0x69;
   1429                 } else if(c==0x130) {
   1430                     /* 0130; F; 0069 0307; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1431                     *pString=iDot;
   1432                     return 2;
   1433                 }
   1434             } else {
   1435                 /* Turkic mappings */
   1436                 if(c==0x49) {
   1437                     /* 0049; T; 0131; # LATIN CAPITAL LETTER I */
   1438                     return 0x131;
   1439                 } else if(c==0x130) {
   1440                     /* 0130; T; 0069; # LATIN CAPITAL LETTER I WITH DOT ABOVE */
   1441                     return 0x69;
   1442                 }
   1443             }
   1444         } else if(HAS_SLOT(excWord, UCASE_EXC_FULL_MAPPINGS)) {
   1445             GET_SLOT_VALUE(excWord, UCASE_EXC_FULL_MAPPINGS, pe, full);
   1446 
   1447             /* start of full case mapping strings */
   1448             ++pe;
   1449 
   1450             /* skip the lowercase result string */
   1451             pe+=full&UCASE_FULL_LOWER;
   1452             full=(full>>4)&0xf;
   1453 
   1454             if(full!=0) {
   1455                 /* set the output pointer to the result string */
   1456                 *pString=pe;
   1457 
   1458                 /* return the string length */
   1459                 return full;
   1460             }
   1461         }
   1462 
   1463         if(HAS_SLOT(excWord, UCASE_EXC_FOLD)) {
   1464             idx=UCASE_EXC_FOLD;
   1465         } else if(HAS_SLOT(excWord, UCASE_EXC_LOWER)) {
   1466             idx=UCASE_EXC_LOWER;
   1467         } else {
   1468             return ~c;
   1469         }
   1470         GET_SLOT_VALUE(excWord, idx, pe2, result);
   1471     }
   1472 
   1473     return (result==c) ? ~result : result;
   1474 }
   1475 
   1476 /* case mapping properties API ---------------------------------------------- */
   1477 
   1478 /* get the UCaseProps singleton, or else its dummy, once and for all */
   1479 #if !UCASE_HARDCODE_DATA
   1480 static const UCaseProps *
   1481 getCaseProps() {
   1482     /*
   1483      * This lazy intialization with double-checked locking (without mutex protection for
   1484      * the initial check) is transiently unsafe under certain circumstances.
   1485      * Check the readme and use u_init() if necessary.
   1486      */
   1487 
   1488     /* the initial check is performed by the GET_CASE_PROPS() macro */
   1489     const UCaseProps *csp;
   1490     UErrorCode errorCode=U_ZERO_ERROR;
   1491 
   1492     csp=ucase_getSingleton(&errorCode);
   1493     if(U_FAILURE(errorCode)) {
   1494         errorCode=U_ZERO_ERROR;
   1495         csp=ucase_getDummy(&errorCode);
   1496         if(U_FAILURE(errorCode)) {
   1497             return NULL;
   1498         }
   1499     }
   1500 
   1501     return csp;
   1502 }
   1503 #endif
   1504 
   1505 /*
   1506  * In ICU 3.0, most Unicode properties were loaded from uprops.icu.
   1507  * ICU 3.2 adds ucase.icu for case mapping properties.
   1508  * ICU 3.4 adds ubidi.icu for bidi/shaping properties and
   1509  * removes case/bidi/shaping properties from uprops.icu.
   1510  *
   1511  * Loading of uprops.icu was never mutex-protected and required u_init()
   1512  * for thread safety.
   1513  * In order to maintain performance for all such properties,
   1514  * ucase.icu and ubidi.icu are loaded lazily, without mutexing.
   1515  * u_init() will try to load them for thread safety,
   1516  * but u_init() will not fail if they are missing.
   1517  *
   1518  * uchar.c maintains a tri-state flag for (not loaded/loaded/failed to load)
   1519  * and an error code for load failure.
   1520  * Instead, here we try to load at most once.
   1521  * If it works, we use the resulting singleton object.
   1522  * If it fails, then we get a dummy object, which always works unless
   1523  * we are seriously out of memory.
   1524  * After the first try, we have a never-changing pointer to either the
   1525  * real singleton or the dummy.
   1526  *
   1527  * This method is used in Unicode properties APIs (uchar.h) that
   1528  * do not have a service object and also do not have an error code parameter.
   1529  * Other API implementations get the singleton themselves
   1530  * (with mutexing), store it in the service object, and report errors.
   1531  *
   1532  * TODO:  Remove this support for non-hardcoded data.  u_init() is publicly
   1533  *        advertised as not being required for thread safety, we cannot
   1534  *        revert to unsafe data loading.
   1535  */
   1536 #if !UCASE_HARDCODE_DATA
   1537 #define GET_CASE_PROPS() (gCsp!=NULL ? gCsp : getCaseProps())
   1538 #else
   1539 #define GET_CASE_PROPS() &ucase_props_singleton
   1540 #endif
   1541 
   1542 /* public API (see uchar.h) */
   1543 
   1544 U_CAPI UBool U_EXPORT2
   1545 u_isULowercase(UChar32 c) {
   1546     return (UBool)(UCASE_LOWER==ucase_getType(GET_CASE_PROPS(), c));
   1547 }
   1548 
   1549 U_CAPI UBool U_EXPORT2
   1550 u_isUUppercase(UChar32 c) {
   1551     return (UBool)(UCASE_UPPER==ucase_getType(GET_CASE_PROPS(), c));
   1552 }
   1553 
   1554 /* Transforms the Unicode character to its lower case equivalent.*/
   1555 U_CAPI UChar32 U_EXPORT2
   1556 u_tolower(UChar32 c) {
   1557     return ucase_tolower(GET_CASE_PROPS(), c);
   1558 }
   1559 
   1560 /* Transforms the Unicode character to its upper case equivalent.*/
   1561 U_CAPI UChar32 U_EXPORT2
   1562 u_toupper(UChar32 c) {
   1563     return ucase_toupper(GET_CASE_PROPS(), c);
   1564 }
   1565 
   1566 /* Transforms the Unicode character to its title case equivalent.*/
   1567 U_CAPI UChar32 U_EXPORT2
   1568 u_totitle(UChar32 c) {
   1569     return ucase_totitle(GET_CASE_PROPS(), c);
   1570 }
   1571 
   1572 /* return the simple case folding mapping for c */
   1573 U_CAPI UChar32 U_EXPORT2
   1574 u_foldCase(UChar32 c, uint32_t options) {
   1575     return ucase_fold(GET_CASE_PROPS(), c, options);
   1576 }
   1577 
   1578 U_CFUNC int32_t U_EXPORT2
   1579 ucase_hasBinaryProperty(UChar32 c, UProperty which) {
   1580     /* case mapping properties */
   1581     const UChar *resultString;
   1582     int32_t locCache;
   1583     const UCaseProps *csp=GET_CASE_PROPS();
   1584     if(csp==NULL) {
   1585         return FALSE;
   1586     }
   1587     switch(which) {
   1588     case UCHAR_LOWERCASE:
   1589         return (UBool)(UCASE_LOWER==ucase_getType(csp, c));
   1590     case UCHAR_UPPERCASE:
   1591         return (UBool)(UCASE_UPPER==ucase_getType(csp, c));
   1592     case UCHAR_SOFT_DOTTED:
   1593         return ucase_isSoftDotted(csp, c);
   1594     case UCHAR_CASE_SENSITIVE:
   1595         return ucase_isCaseSensitive(csp, c);
   1596     case UCHAR_CASED:
   1597         return (UBool)(UCASE_NONE!=ucase_getType(csp, c));
   1598     case UCHAR_CASE_IGNORABLE:
   1599         return (UBool)(ucase_getTypeOrIgnorable(csp, c)>>2);
   1600     /*
   1601      * Note: The following Changes_When_Xyz are defined as testing whether
   1602      * the NFD form of the input changes when Xyz-case-mapped.
   1603      * However, this simpler implementation of these properties,
   1604      * ignoring NFD, passes the tests.
   1605      * The implementation needs to be changed if the tests start failing.
   1606      * When that happens, optimizations should be used to work with the
   1607      * per-single-code point ucase_toFullXyz() functions unless
   1608      * the NFD form has more than one code point,
   1609      * and the property starts set needs to be the union of the
   1610      * start sets for normalization and case mappings.
   1611      */
   1612     case UCHAR_CHANGES_WHEN_LOWERCASED:
   1613         locCache=UCASE_LOC_ROOT;
   1614         return (UBool)(ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1615     case UCHAR_CHANGES_WHEN_UPPERCASED:
   1616         locCache=UCASE_LOC_ROOT;
   1617         return (UBool)(ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1618     case UCHAR_CHANGES_WHEN_TITLECASED:
   1619         locCache=UCASE_LOC_ROOT;
   1620         return (UBool)(ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1621     /* case UCHAR_CHANGES_WHEN_CASEFOLDED: -- in uprops.c */
   1622     case UCHAR_CHANGES_WHEN_CASEMAPPED:
   1623         locCache=UCASE_LOC_ROOT;
   1624         return (UBool)(
   1625             ucase_toFullLower(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1626             ucase_toFullUpper(csp, c, NULL, NULL, &resultString, "", &locCache)>=0 ||
   1627             ucase_toFullTitle(csp, c, NULL, NULL, &resultString, "", &locCache)>=0);
   1628     default:
   1629         return FALSE;
   1630     }
   1631 }
   1632