Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2012, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_bld.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module builds a collator based on the rule set.
     17 *
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/ucoleitr.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 #include "unicode/uscript.h"
     29 #include "unicode/ustring.h"
     30 #include "unicode/utf16.h"
     31 #include "normalizer2impl.h"
     32 #include "ucol_bld.h"
     33 #include "ucol_elm.h"
     34 #include "ucol_cnt.h"
     35 #include "ucln_in.h"
     36 #include "umutex.h"
     37 #include "cmemory.h"
     38 #include "cstring.h"
     39 
     40 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     41 
     42 static const InverseUCATableHeader* _staticInvUCA = NULL;
     43 static UDataMemory* invUCA_DATA_MEM = NULL;
     44 
     45 U_CDECL_BEGIN
     46 static UBool U_CALLCONV
     47 isAcceptableInvUCA(void * /*context*/,
     48                    const char * /*type*/, const char * /*name*/,
     49                    const UDataInfo *pInfo)
     50 {
     51     /* context, type & name are intentionally not used */
     52     if( pInfo->size>=20 &&
     53         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     54         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     55         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
     56         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
     57         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
     58         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
     59         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
     60         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
     61         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
     62         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
     63         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
     64         )
     65     {
     66         UVersionInfo UCDVersion;
     67         u_getUnicodeVersion(UCDVersion);
     68         return (pInfo->dataVersion[0]==UCDVersion[0] &&
     69             pInfo->dataVersion[1]==UCDVersion[1]);
     70             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
     71             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
     72             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
     73     } else {
     74         return FALSE;
     75     }
     76 }
     77 U_CDECL_END
     78 
     79 /*
     80 * Takes two CEs (lead and continuation) and
     81 * compares them as CEs should be compared:
     82 * primary vs. primary, secondary vs. secondary
     83 * tertiary vs. tertiary
     84 */
     85 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
     86     uint32_t s1 = source0, s2, t1 = target0, t2;
     87     if(isContinuation(source1)) {
     88         s2 = source1;
     89     } else {
     90         s2 = 0;
     91     }
     92     if(isContinuation(target1)) {
     93         t2 = target1;
     94     } else {
     95         t2 = 0;
     96     }
     97 
     98     uint32_t s = 0, t = 0;
     99     if(s1 == t1 && s2 == t2) {
    100         return 0;
    101     }
    102     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
    103     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
    104     if(s < t) {
    105         return -1;
    106     } else if(s > t) {
    107         return 1;
    108     } else {
    109         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    110         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    111         if(s < t) {
    112             return -1;
    113         } else if(s > t) {
    114             return 1;
    115         } else {
    116             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    117             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    118             if(s < t) {
    119                 return -1;
    120             } else {
    121                 return 1;
    122             }
    123         }
    124     }
    125 }
    126 
    127 static
    128 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
    129     uint32_t bottom = 0, top = src->invUCA->tableSize;
    130     uint32_t i = 0;
    131     uint32_t first = 0, second = 0;
    132     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    133     int32_t res = 0;
    134 
    135     while(bottom < top-1) {
    136         i = (top+bottom)/2;
    137         first = *(CETable+3*i);
    138         second = *(CETable+3*i+1);
    139         res = compareCEs(first, second, CE, SecondCE);
    140         if(res > 0) {
    141             top = i;
    142         } else if(res < 0) {
    143             bottom = i;
    144         } else {
    145             break;
    146         }
    147     }
    148 
    149     /* weiv:                                                  */
    150     /* in searching for elements, I have removed the failure  */
    151     /* The reason for this is that the builder does not rely  */
    152     /* on search mechanism telling it that it didn't find an  */
    153     /* element. However, indirect positioning relies on being */
    154     /* able to find the elements around any CE, even if it is */
    155     /* not defined in the UCA. */
    156     return i;
    157     /*
    158     if((first == CE && second == SecondCE)) {
    159     return i;
    160     } else {
    161     return -1;
    162     }
    163     */
    164 }
    165 
    166 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    167     0xFFFF0000,
    168     0xFFFFFF00,
    169     0xFFFFFFFF
    170 };
    171 
    172 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
    173                                             uint32_t CE, uint32_t contCE,
    174                                             uint32_t *nextCE, uint32_t *nextContCE,
    175                                             uint32_t strength)
    176 {
    177     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    178     int32_t iCE;
    179 
    180     iCE = ucol_inv_findCE(src, CE, contCE);
    181 
    182     if(iCE<0) {
    183         *nextCE = UCOL_NOT_FOUND;
    184         return -1;
    185     }
    186 
    187     CE &= strengthMask[strength];
    188     contCE &= strengthMask[strength];
    189 
    190     *nextCE = CE;
    191     *nextContCE = contCE;
    192 
    193     while((*nextCE  & strengthMask[strength]) == CE
    194         && (*nextContCE  & strengthMask[strength]) == contCE)
    195     {
    196         *nextCE = (*(CETable+3*(++iCE)));
    197         *nextContCE = (*(CETable+3*(iCE)+1));
    198     }
    199 
    200     return iCE;
    201 }
    202 
    203 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
    204                                             uint32_t CE, uint32_t contCE,
    205                                             uint32_t *prevCE, uint32_t *prevContCE,
    206                                             uint32_t strength)
    207 {
    208     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    209     int32_t iCE;
    210 
    211     iCE = ucol_inv_findCE(src, CE, contCE);
    212 
    213     if(iCE<0) {
    214         *prevCE = UCOL_NOT_FOUND;
    215         return -1;
    216     }
    217 
    218     CE &= strengthMask[strength];
    219     contCE &= strengthMask[strength];
    220 
    221     *prevCE = CE;
    222     *prevContCE = contCE;
    223 
    224     while((*prevCE  & strengthMask[strength]) == CE
    225         && (*prevContCE  & strengthMask[strength])== contCE
    226         && iCE > 0) /* this condition should prevent falling off the edge of the world */
    227     {
    228         /* here, we end up in a singularity - zero */
    229         *prevCE = (*(CETable+3*(--iCE)));
    230         *prevContCE = (*(CETable+3*(iCE)+1));
    231     }
    232 
    233     return iCE;
    234 }
    235 
    236 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
    237                                                        uint32_t prevCE, uint32_t prevContCE)
    238 {
    239     if(prevCE == CE && prevContCE == contCE) {
    240         return UCOL_IDENTICAL;
    241     }
    242     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
    243         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
    244     {
    245         return UCOL_PRIMARY;
    246     }
    247     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
    248         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
    249     {
    250         return UCOL_SECONDARY;
    251     }
    252     return UCOL_TERTIARY;
    253 }
    254 
    255 
    256 /*static
    257 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    258 
    259     uint32_t CE = lh->baseCE;
    260     uint32_t SecondCE = lh->baseContCE;
    261 
    262     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    263     uint32_t previousCE, previousContCE;
    264     int32_t iCE;
    265 
    266     iCE = ucol_inv_findCE(src, CE, SecondCE);
    267 
    268     if(iCE<0) {
    269         return -1;
    270     }
    271 
    272     CE &= strengthMask[strength];
    273     SecondCE &= strengthMask[strength];
    274 
    275     previousCE = CE;
    276     previousContCE = SecondCE;
    277 
    278     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
    279         previousCE = (*(CETable+3*(--iCE)));
    280         previousContCE = (*(CETable+3*(iCE)+1));
    281     }
    282     lh->previousCE = previousCE;
    283     lh->previousContCE = previousContCE;
    284 
    285     return iCE;
    286 }*/
    287 
    288 static
    289 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    290     uint32_t CE = lh->baseCE;
    291     uint32_t SecondCE = lh->baseContCE;
    292 
    293     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    294     uint32_t nextCE, nextContCE;
    295     int32_t iCE;
    296 
    297     iCE = ucol_inv_findCE(src, CE, SecondCE);
    298 
    299     if(iCE<0) {
    300         return -1;
    301     }
    302 
    303     CE &= strengthMask[strength];
    304     SecondCE &= strengthMask[strength];
    305 
    306     nextCE = CE;
    307     nextContCE = SecondCE;
    308 
    309     while((nextCE  & strengthMask[strength]) == CE
    310         && (nextContCE  & strengthMask[strength]) == SecondCE)
    311     {
    312         nextCE = (*(CETable+3*(++iCE)));
    313         nextContCE = (*(CETable+3*(iCE)+1));
    314     }
    315 
    316     lh->nextCE = nextCE;
    317     lh->nextContCE = nextContCE;
    318 
    319     return iCE;
    320 }
    321 
    322 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    323     /* reset all the gaps */
    324     int32_t i = 0;
    325     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    326     uint32_t st = 0;
    327     uint32_t t1, t2;
    328     int32_t pos;
    329 
    330     UColToken *tok = lh->first;
    331     uint32_t tokStrength = tok->strength;
    332 
    333     for(i = 0; i<3; i++) {
    334         lh->gapsHi[3*i] = 0;
    335         lh->gapsHi[3*i+1] = 0;
    336         lh->gapsHi[3*i+2] = 0;
    337         lh->gapsLo[3*i] = 0;
    338         lh->gapsLo[3*i+1] = 0;
    339         lh->gapsLo[3*i+2] = 0;
    340         lh->numStr[i] = 0;
    341         lh->fStrToken[i] = NULL;
    342         lh->lStrToken[i] = NULL;
    343         lh->pos[i] = -1;
    344     }
    345 
    346     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    347 
    348     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
    349         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    350         lh->pos[0] = 0;
    351         t1 = lh->baseCE;
    352         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
    353         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    354         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    355         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    356         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
    357         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
    358 
    359         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
    360         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
    361 
    362         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    363         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    364         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    365     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
    366         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    367         lh->pos[0] = 0;
    368         t1 = lh->baseCE;
    369         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
    370         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    371         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    372         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    373         t1 = lh->nextCE;
    374         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
    375         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    376         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    377         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    378     } else {
    379         for(;;) {
    380             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    381                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
    382                     lh->fStrToken[tokStrength] = tok;
    383                 } else { /* The CE must be implicit, since it's not in the table */
    384                     /* Error */
    385                     *status = U_INTERNAL_PROGRAM_ERROR;
    386                 }
    387             }
    388 
    389             while(tok != NULL && tok->strength >= tokStrength) {
    390                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    391                     lh->lStrToken[tokStrength] = tok;
    392                 }
    393                 tok = tok->next;
    394             }
    395             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
    396                 /* check if previous interval is the same and merge the intervals if it is so */
    397                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
    398                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
    399                     lh->fStrToken[tokStrength+1] = NULL;
    400                     lh->lStrToken[tokStrength+1] = NULL;
    401                     lh->pos[tokStrength+1] = -1;
    402                 }
    403             }
    404             if(tok != NULL) {
    405                 tokStrength = tok->strength;
    406             } else {
    407                 break;
    408             }
    409         }
    410         for(st = 0; st < 3; st++) {
    411             if((pos = lh->pos[st]) >= 0) {
    412                 t1 = *(CETable+3*(pos));
    413                 t2 = *(CETable+3*(pos)+1);
    414                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    415                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    416                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    417                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    418                 //pos--;
    419                 //t1 = *(CETable+3*(pos));
    420                 //t2 = *(CETable+3*(pos)+1);
    421                 t1 = lh->baseCE;
    422                 t2 = lh->baseContCE;
    423                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    424                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    425                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    426             }
    427         }
    428     }
    429 }
    430 
    431 
    432 #define ucol_countBytes(value, noOfBytes)   \
    433 {                               \
    434     uint32_t mask = 0xFFFFFFFF;   \
    435     (noOfBytes) = 0;              \
    436     while(mask != 0) {            \
    437     if(((value) & mask) != 0) { \
    438     (noOfBytes)++;            \
    439     }                           \
    440     mask >>= 8;                 \
    441     }                             \
    442 }
    443 
    444 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    445     if(U_SUCCESS(*status)) {
    446         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    447     }
    448     return g->current;
    449 }
    450 
    451 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    452     /* TODO: rename to enum names */
    453     uint32_t high, low, count=1;
    454     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
    455 
    456     if(strength == UCOL_SECONDARY) {
    457         low = UCOL_COMMON_TOP2<<24;
    458         high = 0xFFFFFFFF;
    459         count = 0xFF - UCOL_COMMON_TOP2;
    460     } else {
    461         low = UCOL_BYTE_COMMON << 24; //0x05000000;
    462         high = 0x40000000;
    463         count = 0x40 - UCOL_BYTE_COMMON;
    464     }
    465 
    466     if(tok->next != NULL && tok->next->strength == strength) {
    467         count = tok->next->toInsert;
    468     }
    469 
    470     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    471     g->current = UCOL_BYTE_COMMON<<24;
    472 
    473     if(g->noOfRanges == 0) {
    474         *status = U_INTERNAL_PROGRAM_ERROR;
    475     }
    476     return g->current;
    477 }
    478 
    479 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
    480     uint32_t strength = tok->strength;
    481     uint32_t low = lows[fStrength*3+strength];
    482     uint32_t high = highs[fStrength*3+strength];
    483     uint32_t maxByte = 0;
    484     if(strength == UCOL_TERTIARY) {
    485         maxByte = 0x3F;
    486     } else if(strength == UCOL_PRIMARY) {
    487         maxByte = 0xFE;
    488     } else {
    489         maxByte = 0xFF;
    490     }
    491 
    492     uint32_t count = tok->toInsert;
    493 
    494     if(low >= high && strength > UCOL_PRIMARY) {
    495         int32_t s = strength;
    496         for(;;) {
    497             s--;
    498             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
    499                 if(strength == UCOL_SECONDARY) {
    500                     if (low < UCOL_COMMON_TOP2<<24 ) {
    501                        // Override if low range is less than UCOL_COMMON_TOP2.
    502 		        low = UCOL_COMMON_TOP2<<24;
    503                     }
    504                     high = 0xFFFFFFFF;
    505                 } else {
    506                     // Override if low range is less than UCOL_COMMON_BOT3.
    507 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
    508                         low = UCOL_COMMON_BOT3<<24;
    509 		    }
    510                     high = 0x40000000;
    511                 }
    512                 break;
    513             }
    514             if(s<0) {
    515                 *status = U_INTERNAL_PROGRAM_ERROR;
    516                 return 0;
    517             }
    518         }
    519     }
    520 
    521     if(low < 0x02000000) {
    522         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
    523         // See http://site.icu-project.org/design/collation/bytes
    524         low = 0x02000000;
    525     }
    526 
    527     if(strength == UCOL_SECONDARY) { /* similar as simple */
    528         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    529             low = UCOL_COMMON_TOP2<<24;
    530         }
    531         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    532             high = UCOL_COMMON_TOP2<<24;
    533         }
    534         if(low < (UCOL_COMMON_BOT2<<24)) {
    535             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
    536             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    537             //g->current = UCOL_COMMON_BOT2<<24;
    538             return g->current;
    539         }
    540     }
    541 
    542     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    543     if(g->noOfRanges == 0) {
    544         *status = U_INTERNAL_PROGRAM_ERROR;
    545     }
    546     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    547     return g->current;
    548 }
    549 
    550 static
    551 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    552     uint32_t i = 0;
    553     UChar c;
    554 
    555     if(U_FAILURE(*status)) {
    556         return 0;
    557     }
    558 
    559     if(sourceLen > resLen) {
    560         *status = U_MEMORY_ALLOCATION_ERROR;
    561         return 0;
    562     }
    563 
    564     for(i = 0; i < sourceLen; i++) {
    565         c = source[i];
    566         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    567             switch(c - 0x3000) {
    568             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
    569             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
    570                 c++;
    571                 break;
    572             case 0xF5:
    573                 c = 0x30AB;
    574                 break;
    575             case 0xF6:
    576                 c = 0x30B1;
    577                 break;
    578             }
    579         }
    580         resBuf[i] = c;
    581     }
    582     return sourceLen;
    583 }
    584 
    585 static
    586 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    587     uint32_t i = 0;
    588     UChar c;
    589 
    590     if(U_FAILURE(*status)) {
    591         return 0;
    592     }
    593 
    594     if(sourceLen > resLen) {
    595         *status = U_MEMORY_ALLOCATION_ERROR;
    596         return 0;
    597     }
    598 
    599     for(i = 0; i < sourceLen; i++) {
    600         c = source[i];
    601         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    602             switch(c - 0x3000) {
    603             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
    604             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
    605                 c--;
    606                 break;
    607             case 0xAB:
    608                 c = 0x30F5;
    609                 break;
    610             case 0xB1:
    611                 c = 0x30F6;
    612                 break;
    613             }
    614         }
    615         resBuf[i] = c;
    616     }
    617     return sourceLen;
    618 }
    619 
    620 U_NAMESPACE_BEGIN
    621 
    622 static
    623 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
    624     uint32_t i = 0;
    625     UChar n[128];
    626     uint32_t nLen = 0;
    627     uint32_t uCount = 0, lCount = 0;
    628 
    629     collIterate s;
    630     uint32_t order = 0;
    631 
    632     if(U_FAILURE(*status)) {
    633         return UCOL_LOWER_CASE;
    634     }
    635 
    636     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
    637     if(U_SUCCESS(*status)) {
    638         for(i = 0; i < nLen; i++) {
    639             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
    640             order = ucol_getNextCE(UCA, &s, status);
    641             if(isContinuation(order)) {
    642                 *status = U_INTERNAL_PROGRAM_ERROR;
    643                 return UCOL_LOWER_CASE;
    644             }
    645             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
    646                 uCount++;
    647             } else {
    648                 if(u_islower(n[i])) {
    649                     lCount++;
    650                 } else if(U_SUCCESS(*status)) {
    651                     UChar sk[1], lk[1];
    652                     u_toSmallKana(&n[i], 1, sk, 1, status);
    653                     u_toLargeKana(&n[i], 1, lk, 1, status);
    654                     if(sk[0] == n[i] && lk[0] != n[i]) {
    655                         lCount++;
    656                     }
    657                 }
    658             }
    659         }
    660     }
    661 
    662     if(uCount != 0 && lCount != 0) {
    663         return UCOL_MIXED_CASE;
    664     } else if(uCount != 0) {
    665         return UCOL_UPPER_CASE;
    666     } else {
    667         return UCOL_LOWER_CASE;
    668     }
    669 }
    670 
    671 
    672 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    673     /* this one makes the table and stuff */
    674     uint32_t noOfBytes[3];
    675     uint32_t i;
    676 
    677     for(i = 0; i<3; i++) {
    678         ucol_countBytes(CEparts[i], noOfBytes[i]);
    679     }
    680 
    681     /* Here we have to pack CEs from parts */
    682 
    683     uint32_t CEi = 0;
    684     uint32_t value = 0;
    685 
    686     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
    687         if(CEi > 0) {
    688             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    689         } else {
    690             value = 0;
    691         }
    692 
    693         if(2*CEi<noOfBytes[0]) {
    694             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
    695         }
    696         if(CEi<noOfBytes[1]) {
    697             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
    698         }
    699         if(CEi<noOfBytes[2]) {
    700             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
    701         }
    702         tok->CEs[CEi] = value;
    703         CEi++;
    704     }
    705     if(CEi == 0) { /* totally ignorable */
    706         tok->noOfCEs = 1;
    707         tok->CEs[0] = 0;
    708     } else { /* there is at least something */
    709         tok->noOfCEs = CEi;
    710     }
    711 
    712 
    713     // we want to set case bits here and now, not later.
    714     // Case bits handling
    715     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
    716         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    717         int32_t cSize = (tok->source & 0xFF000000) >> 24;
    718         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
    719 
    720         if(cSize > 1) {
    721             // Do it manually
    722             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
    723         } else {
    724             // Copy it from the UCA
    725             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
    726             tok->CEs[0] |= (caseCE & 0xC0);
    727         }
    728     }
    729 
    730 #if UCOL_DEBUG==2
    731     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    732     for(i = 0; i<tok->noOfCEs; i++) {
    733         fprintf(stderr, "%08X ", tok->CEs[i]);
    734     }
    735     fprintf(stderr, "\n");
    736 #endif
    737 }
    738 
    739 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    740     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    741     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
    742 
    743     UColToken *tok = lh->last;
    744     uint32_t t[UCOL_STRENGTH_LIMIT];
    745 
    746     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
    747 
    748     /* must initialize ranges to avoid memory check warnings */
    749     for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
    750         uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
    751     }
    752 
    753     tok->toInsert = 1;
    754     t[tok->strength] = 1;
    755 
    756     while(tok->previous != NULL) {
    757         if(tok->previous->strength < tok->strength) { /* going up */
    758             t[tok->strength] = 0;
    759             t[tok->previous->strength]++;
    760         } else if(tok->previous->strength > tok->strength) { /* going down */
    761             t[tok->previous->strength] = 1;
    762         } else {
    763             t[tok->strength]++;
    764         }
    765         tok=tok->previous;
    766         tok->toInsert = t[tok->strength];
    767     }
    768 
    769     tok->toInsert = t[tok->strength];
    770     ucol_inv_getGapPositions(src, lh, status);
    771 
    772 #if UCOL_DEBUG
    773     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    774     int32_t j = 2;
    775     for(j = 2; j >= 0; j--) {
    776         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    777         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    778     }
    779     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    780 
    781     do {
    782         fprintf(stderr,"%i", tok->strength);
    783         tok = tok->next;
    784     } while(tok != NULL);
    785     fprintf(stderr, "\n");
    786 
    787     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    788 
    789     do {
    790         fprintf(stderr,"%i", tok->toInsert);
    791         tok = tok->next;
    792     } while(tok != NULL);
    793 #endif
    794 
    795     tok = lh->first;
    796     uint32_t fStrength = UCOL_IDENTICAL;
    797     uint32_t initStrength = UCOL_IDENTICAL;
    798 
    799 
    800     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    801     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    802     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
    803 
    804     while (tok != NULL && U_SUCCESS(*status)) {
    805         fStrength = tok->strength;
    806         if(fStrength < initStrength) {
    807             initStrength = fStrength;
    808             if(lh->pos[fStrength] == -1) {
    809                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
    810                     fStrength--;
    811                 }
    812                 if(lh->pos[fStrength] == -1) {
    813                     *status = U_INTERNAL_PROGRAM_ERROR;
    814                     return;
    815                 }
    816             }
    817             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
    818                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    819                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
    820                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
    821                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
    822             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
    823                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    824                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
    825                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    826                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    827             } else { /* primaries */
    828                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
    829                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    830                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    831                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    832             }
    833         } else {
    834             if(tok->strength == UCOL_TERTIARY) {
    835                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
    836             } else if(tok->strength == UCOL_SECONDARY) {
    837                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
    838                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    839             } else if(tok->strength == UCOL_PRIMARY) {
    840                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
    841                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    842                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    843             }
    844         }
    845         ucol_doCE(src, CEparts, tok, status);
    846         tok = tok->next;
    847     }
    848 }
    849 
    850 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    851     UCAElements el;
    852     UColToken *tok = lh->first;
    853     UColToken *expt = NULL;
    854     uint32_t i = 0, j = 0;
    855     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    856 
    857     while(tok != NULL && U_SUCCESS(*status)) {
    858         /* first, check if there are any expansions */
    859         /* if there are expansions, we need to do a little bit more processing */
    860         /* since parts of expansion can be tailored, while others are not */
    861         if(tok->expansion != 0) {
    862             uint32_t len = tok->expansion >> 24;
    863             uint32_t currentSequenceLen = len;
    864             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
    865             //uint32_t exp = currentSequenceLen | expOffset;
    866             UColToken exp;
    867             exp.source = currentSequenceLen | expOffset;
    868             exp.rulesToParseHdl = &(src->source);
    869 
    870             while(len > 0) {
    871                 currentSequenceLen = len;
    872                 while(currentSequenceLen > 0) {
    873                     exp.source = (currentSequenceLen << 24) | expOffset;
    874                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
    875                         uint32_t noOfCEsToCopy = expt->noOfCEs;
    876                         for(j = 0; j<noOfCEsToCopy; j++) {
    877                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
    878                         }
    879                         tok->noOfExpCEs += noOfCEsToCopy;
    880                         // Smart people never try to add codepoints and CEs.
    881                         // For some odd reason, it won't work.
    882                         expOffset += currentSequenceLen; //noOfCEsToCopy;
    883                         len -= currentSequenceLen; //noOfCEsToCopy;
    884                         break;
    885                     } else {
    886                         currentSequenceLen--;
    887                     }
    888                 }
    889                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
    890                     /* will have to get one from UCA */
    891                     /* first, get the UChars from the rules */
    892                     /* then pick CEs out until there is no more and stuff them into expansion */
    893                     collIterate s;
    894                     uint32_t order = 0;
    895                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
    896 
    897                     for(;;) {
    898                         order = ucol_getNextCE(src->UCA, &s, status);
    899                         if(order == UCOL_NO_MORE_CES) {
    900                             break;
    901                         }
    902                         tok->expCEs[tok->noOfExpCEs++] = order;
    903                     }
    904                     expOffset++;
    905                     len--;
    906                 }
    907             }
    908         } else {
    909             tok->noOfExpCEs = 0;
    910         }
    911 
    912         /* set the ucaelement with obtained values */
    913         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
    914         /* copy CEs */
    915         for(i = 0; i<tok->noOfCEs; i++) {
    916             el.CEs[i] = tok->CEs[i];
    917         }
    918         for(i = 0; i<tok->noOfExpCEs; i++) {
    919             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
    920         }
    921 
    922         /* copy UChars */
    923         // We kept prefix and source kind of together, as it is a kind of a contraction.
    924         // However, now we have to slice the prefix off the main thing -
    925         el.prefix = el.prefixChars;
    926         el.cPoints = el.uchars;
    927         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
    928             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
    929             // decomposed elements to the unsaf table.
    930             el.prefixSize = tok->prefix>>24;
    931             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
    932 
    933             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
    934             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
    935         } else {
    936             el.prefixSize = 0;
    937             *el.prefix = 0;
    938 
    939             el.cSize = (tok->source >> 24);
    940             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    941         }
    942         if(src->UCA != NULL) {
    943             for(i = 0; i<el.cSize; i++) {
    944                 if(UCOL_ISJAMO(el.cPoints[i])) {
    945                     t->image->jamoSpecial = TRUE;
    946                 }
    947             }
    948             if (!src->buildCCTabFlag && el.cSize > 0) {
    949                 // Check the trailing canonical combining class (tccc) of the last character.
    950                 const UChar *s = el.cPoints + el.cSize;
    951                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
    952                 if ((fcd & 0xff) != 0) {
    953                     src->buildCCTabFlag = TRUE;
    954                 }
    955             }
    956         }
    957 
    958         /* and then, add it */
    959 #if UCOL_DEBUG==2
    960         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
    961 #endif
    962         uprv_uca_addAnElement(t, &el, status);
    963 
    964 #if UCOL_DEBUG_DUPLICATES
    965         if(*status != U_ZERO_ERROR) {
    966             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
    967             *status = U_ZERO_ERROR;
    968         }
    969 #endif
    970 
    971         tok = tok->next;
    972     }
    973 }
    974 
    975 U_CDECL_BEGIN
    976 static UBool U_CALLCONV
    977 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    978     UErrorCode status = U_ZERO_ERROR;
    979     tempUCATable *t = (tempUCATable *)context;
    980     if(value == 0) {
    981         while(start < limit) {
    982             uint32_t CE = utrie_get32(t->mapping, start, NULL);
    983             if(CE == UCOL_NOT_FOUND) {
    984                 UCAElements el;
    985                 el.isThai = FALSE;
    986                 el.prefixSize = 0;
    987                 el.prefixChars[0] = 0;
    988                 el.prefix = el.prefixChars;
    989                 el.cPoints = el.uchars;
    990 
    991                 el.cSize = 0;
    992                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
    993 
    994                 el.noOfCEs = 1;
    995                 el.CEs[0] = 0;
    996                 uprv_uca_addAnElement(t, &el, &status);
    997 
    998             }
    999             start++;
   1000         }
   1001     }
   1002     if(U_FAILURE(status)) {
   1003         return FALSE;
   1004     } else {
   1005         return TRUE;
   1006     }
   1007 }
   1008 U_CDECL_END
   1009 
   1010 static void
   1011 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
   1012                                UChar32 start, UChar32 end,
   1013                                UErrorCode *status)
   1014 {
   1015     //UChar decomp[256];
   1016     uint32_t CE = UCOL_NOT_FOUND;
   1017     UChar32 u = 0;
   1018     UCAElements el;
   1019     el.isThai = FALSE;
   1020     el.prefixSize = 0;
   1021     el.prefixChars[0] = 0;
   1022     collIterate colIt;
   1023 
   1024     if(U_SUCCESS(*status)) {
   1025         for(u = start; u<=end; u++) {
   1026             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
   1027                 /* this test is for contractions that are missing the starting element. */
   1028                 || ((isCntTableElement(CE)) &&
   1029                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
   1030                 )
   1031             {
   1032                 el.cSize = 0;
   1033                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
   1034                 //decomp[0] = (UChar)u;
   1035                 //el.uchars[0] = (UChar)u;
   1036                 el.cPoints = el.uchars;
   1037                 //el.cSize = 1;
   1038                 el.noOfCEs = 0;
   1039                 el.prefix = el.prefixChars;
   1040                 el.prefixSize = 0;
   1041                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
   1042                 // We actually want to check whether this element is a special
   1043                 // If it is an implicit element (hangul, CJK - we want to copy the
   1044                 // special, not the resolved CEs) - for hangul, copying resolved
   1045                 // would just make things the same (there is an expansion and it
   1046                 // takes approximately the same amount of time to resolve as
   1047                 // falling back to the UCA).
   1048                 /*
   1049                 UTRIE_GET32(src->UCA->mapping, u, CE);
   1050                 tag = getCETag(CE);
   1051                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
   1052                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
   1053                 || tag == LEAD_SURROGATE_TAG) {
   1054                 el.CEs[el.noOfCEs++] = CE;
   1055                 } else {
   1056                 */
   1057                 // It turns out that it does not make sense to keep implicits
   1058                 // unresolved. The cost of resolving them is big enough so that
   1059                 // it doesn't make any difference whether we have to go to the UCA
   1060                 // or not.
   1061                 {
   1062                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
   1063                     while(CE != UCOL_NO_MORE_CES) {
   1064                         CE = ucol_getNextCE(src->UCA, &colIt, status);
   1065                         if(CE != UCOL_NO_MORE_CES) {
   1066                             el.CEs[el.noOfCEs++] = CE;
   1067                         }
   1068                     }
   1069                 }
   1070                 uprv_uca_addAnElement(t, &el, status);
   1071             }
   1072         }
   1073     }
   1074 }
   1075 
   1076 U_NAMESPACE_END
   1077 
   1078 U_CFUNC UCATableHeader *
   1079 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
   1080     U_NAMESPACE_USE
   1081 
   1082     uint32_t i = 0;
   1083     if(U_FAILURE(*status)) {
   1084         return NULL;
   1085     }
   1086     /*
   1087     2.  Eliminate the negative lists by doing the following for each non-null negative list:
   1088     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
   1089     create new ListHeader X
   1090     o   reverse the list, add to the end of X's positive list. Reset the strength of the
   1091     first item you add, based on the stronger strength levels of the two lists.
   1092     */
   1093     /*
   1094     3.  For each ListHeader with a non-null positive list:
   1095     */
   1096     /*
   1097     o   Find all character strings with CEs between the baseCE and the
   1098     next/previous CE, at the strength of the first token. Add these to the
   1099     tailoring.
   1100     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
   1101     tailoring has & x < z...
   1102     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
   1103     */
   1104     /* It is possible that this part should be done even while constructing list */
   1105     /* The problem is that it is unknown what is going to be the strongest weight */
   1106     /* So we might as well do it here */
   1107 
   1108     /*
   1109     o   Allocate CEs for each token in the list, based on the total number N of the
   1110     largest level difference, and the gap G between baseCE and nextCE at that
   1111     level. The relation * between the last item and nextCE is the same as the
   1112     strongest strength.
   1113     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
   1114     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
   1115     Then fit b and c into the secondary gap between a and d, then fit q
   1116     into the tertiary gap between b and c.
   1117 
   1118     o   Example: baseCE << b <<< q << c * nextCE(X,2)
   1119     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
   1120     Then fit q into the tertiary gap between b and c.
   1121     o   When incrementing primary values, we will not cross high byte
   1122     boundaries except where there is only a single-byte primary. That is to
   1123     ensure that the script reordering will continue to work.
   1124     */
   1125     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
   1126     /* test for NULL */
   1127     if (image == NULL) {
   1128         *status = U_MEMORY_ALLOCATION_ERROR;
   1129         return NULL;
   1130     }
   1131     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
   1132 
   1133     for(i = 0; i<src->resultLen; i++) {
   1134         /* now we need to generate the CEs */
   1135         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1136         /* According to strength                                                          */
   1137         if(U_SUCCESS(*status)) {
   1138             if(src->lh[i].first) { // if there are any elements
   1139                 // due to the way parser works, subsequent tailorings
   1140                 // may remove all the elements from a sequence, therefore
   1141                 // leaving an empty tailoring sequence.
   1142                 ucol_initBuffers(src, &src->lh[i], status);
   1143             }
   1144         }
   1145         if(U_FAILURE(*status)) {
   1146             uprv_free(image);
   1147             return NULL;
   1148         }
   1149     }
   1150 
   1151     if(src->varTop != NULL) { /* stuff the variable top value */
   1152         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
   1153         /* remove it from the list */
   1154         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
   1155             src->varTop->listHeader->first = src->varTop->next;
   1156         }
   1157         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
   1158             src->varTop->listHeader->last = src->varTop->previous;
   1159         }
   1160         if(src->varTop->next != NULL) {
   1161             src->varTop->next->previous = src->varTop->previous;
   1162         }
   1163         if(src->varTop->previous != NULL) {
   1164             src->varTop->previous->next = src->varTop->next;
   1165         }
   1166     }
   1167 
   1168 
   1169     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
   1170     if(U_FAILURE(*status)) {
   1171         uprv_free(image);
   1172         return NULL;
   1173     }
   1174 
   1175 
   1176     /* After this, we have assigned CE values to all regular CEs      */
   1177     /* now we will go through list once more and resolve expansions,  */
   1178     /* make UCAElements structs and add them to table                 */
   1179     for(i = 0; i<src->resultLen; i++) {
   1180         /* now we need to generate the CEs */
   1181         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1182         /* According to strength                                                          */
   1183         if(U_SUCCESS(*status)) {
   1184             ucol_createElements(src, t, &src->lh[i], status);
   1185         }
   1186     }
   1187 
   1188     UCAElements el;
   1189     el.isThai = FALSE;
   1190     el.prefixSize = 0;
   1191     el.prefixChars[0] = 0;
   1192 
   1193     /* add latin-1 stuff */
   1194     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
   1195 
   1196     /* add stuff for copying */
   1197     if(src->copySet != NULL) {
   1198         int32_t i = 0;
   1199         UnicodeSet *set = (UnicodeSet *)src->copySet;
   1200         for(i = 0; i < set->getRangeCount(); i++) {
   1201             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
   1202         }
   1203     }
   1204 
   1205     if(U_SUCCESS(*status)) {
   1206         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
   1207 
   1208         uint32_t tailoredCE = UCOL_NOT_FOUND;
   1209         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
   1210         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
   1211         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
   1212         // Check for null pointer
   1213         if (ucaEl == NULL) {
   1214             *status = U_MEMORY_ALLOCATION_ERROR;
   1215             return NULL;
   1216         }
   1217         while(*conts != 0) {
   1218             // A continuation is NUL-terminated and NUL-padded
   1219             // except if it has the maximum length.
   1220             int32_t contractionLength = maxUCAContractionLength;
   1221             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
   1222                 --contractionLength;
   1223             }
   1224             UChar32 first;
   1225             int32_t firstLength = 0;
   1226             U16_NEXT(conts, firstLength, contractionLength, first);
   1227             tailoredCE = utrie_get32(t->mapping, first, NULL);
   1228             if(tailoredCE != UCOL_NOT_FOUND) {
   1229                 UBool needToAdd = TRUE;
   1230                 if(isCntTableElement(tailoredCE)) {
   1231                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
   1232                         needToAdd = FALSE;
   1233                     }
   1234                 }
   1235                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
   1236                     UCAElements elm;
   1237                     elm.cPoints = el.uchars;
   1238                     elm.noOfCEs = 0;
   1239                     elm.uchars[0] = *conts;
   1240                     elm.uchars[1] = 0;
   1241                     elm.cSize = 1;
   1242                     elm.prefixChars[0] = *(conts+2);
   1243                     elm.isThai = FALSE;
   1244                     elm.prefix = elm.prefixChars;
   1245                     elm.prefixSize = 1;
   1246                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
   1247                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
   1248                         needToAdd = TRUE;
   1249                     }
   1250                 }
   1251                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
   1252                     needToAdd = FALSE;
   1253                 }
   1254 
   1255                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
   1256                     if (*(conts+1) != 0) {  // contractions
   1257                         el.prefix = el.prefixChars;
   1258                         el.prefixSize = 0;
   1259                         el.cPoints = el.uchars;
   1260                         el.noOfCEs = 0;
   1261                         u_memcpy(el.uchars, conts, contractionLength);
   1262                         el.cSize = contractionLength;
   1263                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
   1264                     }
   1265                     else { // pre-context character
   1266                         UChar str[4] = { 0 };
   1267                         int32_t len=0;
   1268                         int32_t preKeyLen=0;
   1269 
   1270                         el.cPoints = el.uchars;
   1271                         el.noOfCEs = 0;
   1272                         el.uchars[0] = *conts;
   1273                         el.uchars[1] = 0;
   1274                         el.cSize = 1;
   1275                         el.prefixChars[0] = *(conts+2);
   1276                         el.prefix = el.prefixChars;
   1277                         el.prefixSize = 1;
   1278                         if (el.prefixChars[0]!=0) {
   1279                             // get CE of prefix character first
   1280                             str[0]=el.prefixChars[0];
   1281                             str[1]=0;
   1282                             ucol_setText(ucaEl, str, 1, status);
   1283                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
   1284                                     != UCOL_NULLORDER) {
   1285                                 preKeyLen++;  // count number of keys for prefix character
   1286                             }
   1287                             str[len++] = el.prefixChars[0];
   1288                         }
   1289 
   1290                         str[len++] = el.uchars[0];
   1291                         str[len]=0;
   1292                         ucol_setText(ucaEl, str, len, status);
   1293                         // Skip the keys for prefix character, then copy the rest to el.
   1294                         while ((preKeyLen-->0) &&
   1295                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1296                             continue;
   1297                         }
   1298 
   1299                     }
   1300                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1301                         el.noOfCEs++;
   1302                     }
   1303                     uprv_uca_addAnElement(t, &el, status);
   1304                 }
   1305 
   1306             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
   1307                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
   1308             }
   1309             conts+=maxUCAContractionLength;
   1310         }
   1311         ucol_closeElements(ucaEl);
   1312     }
   1313 
   1314     // Add completely ignorable elements
   1315     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
   1316 
   1317     // add tailoring characters related canonical closures
   1318     uprv_uca_canonicalClosure(t, src, NULL, status);
   1319 
   1320     /* still need to produce compatibility closure */
   1321 
   1322     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1323 
   1324     uprv_uca_closeTempTable(t);
   1325     uprv_free(image);
   1326 
   1327     return myData;
   1328 }
   1329 
   1330 U_CDECL_BEGIN
   1331 static UBool U_CALLCONV
   1332 ucol_bld_cleanup(void)
   1333 {
   1334     udata_close(invUCA_DATA_MEM);
   1335     invUCA_DATA_MEM = NULL;
   1336     _staticInvUCA = NULL;
   1337     return TRUE;
   1338 }
   1339 U_CDECL_END
   1340 
   1341 U_CAPI const InverseUCATableHeader * U_EXPORT2
   1342 ucol_initInverseUCA(UErrorCode *status)
   1343 {
   1344     if(U_FAILURE(*status)) return NULL;
   1345 
   1346     UBool needsInit;
   1347     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
   1348 
   1349     if(needsInit) {
   1350         InverseUCATableHeader *newInvUCA = NULL;
   1351         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
   1352 
   1353         if(U_FAILURE(*status)) {
   1354             if (result) {
   1355                 udata_close(result);
   1356             }
   1357             // This is not needed, as we are talking about
   1358             // memory we got from UData
   1359             //uprv_free(newInvUCA);
   1360         }
   1361 
   1362         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
   1363             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
   1364             UCollator *UCA = ucol_initUCA(status);
   1365             // UCA versions of UCA and inverse UCA should match
   1366             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
   1367                 *status = U_INVALID_FORMAT_ERROR;
   1368                 udata_close(result);
   1369                 return NULL;
   1370             }
   1371 
   1372             umtx_lock(NULL);
   1373             if(_staticInvUCA == NULL) {
   1374                 invUCA_DATA_MEM = result;
   1375                 _staticInvUCA = newInvUCA;
   1376                 result = NULL;
   1377                 newInvUCA = NULL;
   1378             }
   1379             umtx_unlock(NULL);
   1380 
   1381             if(newInvUCA != NULL) {
   1382                 udata_close(result);
   1383                 // This is not needed, as we are talking about
   1384                 // memory we got from UData
   1385                 //uprv_free(newInvUCA);
   1386             }
   1387             else {
   1388                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
   1389             }
   1390         }
   1391     }
   1392     return _staticInvUCA;
   1393 }
   1394 
   1395 /* This is the data that is used for non-script reordering codes. These _must_ be kept
   1396  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
   1397  */
   1398 static const char * const ReorderingTokenNames[] = {
   1399     "SPACE",
   1400     "PUNCT",
   1401     "SYMBOL",
   1402     "CURRENCY",
   1403     "DIGIT"
   1404 };
   1405 
   1406 static void toUpper(const char* src, char* dst, uint32_t length) {
   1407    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
   1408        *dst = uprv_toupper(*src);
   1409    }
   1410    *dst = '\0';
   1411 }
   1412 
   1413 U_INTERNAL int32_t U_EXPORT2
   1414 ucol_findReorderingEntry(const char* name) {
   1415     char buffer[32];
   1416     toUpper(name, buffer, 32);
   1417     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
   1418         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
   1419             return entry + UCOL_REORDER_CODE_FIRST;
   1420         }
   1421     }
   1422     return USCRIPT_INVALID_CODE;
   1423 }
   1424 
   1425 #endif /* #if !UCONFIG_NO_COLLATION */
   1426