Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_bld.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module builds a collator based on the rule set.
     17 *
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/ucoleitr.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 #include "unicode/uscript.h"
     29 #include "unicode/ustring.h"
     30 #include "normalizer2impl.h"
     31 #include "ucol_bld.h"
     32 #include "ucol_elm.h"
     33 #include "ucol_cnt.h"
     34 #include "ucln_in.h"
     35 #include "umutex.h"
     36 #include "cmemory.h"
     37 #include "cstring.h"
     38 
     39 U_NAMESPACE_BEGIN
     40 
     41 static const InverseUCATableHeader* _staticInvUCA = NULL;
     42 static UDataMemory* invUCA_DATA_MEM = NULL;
     43 
     44 U_CDECL_BEGIN
     45 static UBool U_CALLCONV
     46 isAcceptableInvUCA(void * /*context*/,
     47                    const char * /*type*/, const char * /*name*/,
     48                    const UDataInfo *pInfo)
     49 {
     50     /* context, type & name are intentionally not used */
     51     if( pInfo->size>=20 &&
     52         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     53         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     54         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
     55         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
     56         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
     57         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
     58         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
     59         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
     60         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
     61         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
     62         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
     63         )
     64     {
     65         UVersionInfo UCDVersion;
     66         u_getUnicodeVersion(UCDVersion);
     67         return (pInfo->dataVersion[0]==UCDVersion[0] &&
     68             pInfo->dataVersion[1]==UCDVersion[1]);
     69             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
     70             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
     71             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
     72     } else {
     73         return FALSE;
     74     }
     75 }
     76 U_CDECL_END
     77 
     78 /*
     79 * Takes two CEs (lead and continuation) and
     80 * compares them as CEs should be compared:
     81 * primary vs. primary, secondary vs. secondary
     82 * tertiary vs. tertiary
     83 */
     84 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
     85     uint32_t s1 = source0, s2, t1 = target0, t2;
     86     if(isContinuation(source1)) {
     87         s2 = source1;
     88     } else {
     89         s2 = 0;
     90     }
     91     if(isContinuation(target1)) {
     92         t2 = target1;
     93     } else {
     94         t2 = 0;
     95     }
     96 
     97     uint32_t s = 0, t = 0;
     98     if(s1 == t1 && s2 == t2) {
     99         return 0;
    100     }
    101     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
    102     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
    103     if(s < t) {
    104         return -1;
    105     } else if(s > t) {
    106         return 1;
    107     } else {
    108         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    109         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    110         if(s < t) {
    111             return -1;
    112         } else if(s > t) {
    113             return 1;
    114         } else {
    115             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    116             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    117             if(s < t) {
    118                 return -1;
    119             } else {
    120                 return 1;
    121             }
    122         }
    123     }
    124 }
    125 
    126 static
    127 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
    128     uint32_t bottom = 0, top = src->invUCA->tableSize;
    129     uint32_t i = 0;
    130     uint32_t first = 0, second = 0;
    131     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    132     int32_t res = 0;
    133 
    134     while(bottom < top-1) {
    135         i = (top+bottom)/2;
    136         first = *(CETable+3*i);
    137         second = *(CETable+3*i+1);
    138         res = compareCEs(first, second, CE, SecondCE);
    139         if(res > 0) {
    140             top = i;
    141         } else if(res < 0) {
    142             bottom = i;
    143         } else {
    144             break;
    145         }
    146     }
    147 
    148     /* weiv:                                                  */
    149     /* in searching for elements, I have removed the failure  */
    150     /* The reason for this is that the builder does not rely  */
    151     /* on search mechanism telling it that it didn't find an  */
    152     /* element. However, indirect positioning relies on being */
    153     /* able to find the elements around any CE, even if it is */
    154     /* not defined in the UCA. */
    155     return i;
    156     /*
    157     if((first == CE && second == SecondCE)) {
    158     return i;
    159     } else {
    160     return -1;
    161     }
    162     */
    163 }
    164 
    165 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    166     0xFFFF0000,
    167     0xFFFFFF00,
    168     0xFFFFFFFF
    169 };
    170 
    171 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
    172                                             uint32_t CE, uint32_t contCE,
    173                                             uint32_t *nextCE, uint32_t *nextContCE,
    174                                             uint32_t strength)
    175 {
    176     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    177     int32_t iCE;
    178 
    179     iCE = ucol_inv_findCE(src, CE, contCE);
    180 
    181     if(iCE<0) {
    182         *nextCE = UCOL_NOT_FOUND;
    183         return -1;
    184     }
    185 
    186     CE &= strengthMask[strength];
    187     contCE &= strengthMask[strength];
    188 
    189     *nextCE = CE;
    190     *nextContCE = contCE;
    191 
    192     while((*nextCE  & strengthMask[strength]) == CE
    193         && (*nextContCE  & strengthMask[strength]) == contCE)
    194     {
    195         *nextCE = (*(CETable+3*(++iCE)));
    196         *nextContCE = (*(CETable+3*(iCE)+1));
    197     }
    198 
    199     return iCE;
    200 }
    201 
    202 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
    203                                             uint32_t CE, uint32_t contCE,
    204                                             uint32_t *prevCE, uint32_t *prevContCE,
    205                                             uint32_t strength)
    206 {
    207     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    208     int32_t iCE;
    209 
    210     iCE = ucol_inv_findCE(src, CE, contCE);
    211 
    212     if(iCE<0) {
    213         *prevCE = UCOL_NOT_FOUND;
    214         return -1;
    215     }
    216 
    217     CE &= strengthMask[strength];
    218     contCE &= strengthMask[strength];
    219 
    220     *prevCE = CE;
    221     *prevContCE = contCE;
    222 
    223     while((*prevCE  & strengthMask[strength]) == CE
    224         && (*prevContCE  & strengthMask[strength])== contCE
    225         && iCE > 0) /* this condition should prevent falling off the edge of the world */
    226     {
    227         /* here, we end up in a singularity - zero */
    228         *prevCE = (*(CETable+3*(--iCE)));
    229         *prevContCE = (*(CETable+3*(iCE)+1));
    230     }
    231 
    232     return iCE;
    233 }
    234 
    235 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
    236                                                        uint32_t prevCE, uint32_t prevContCE)
    237 {
    238     if(prevCE == CE && prevContCE == contCE) {
    239         return UCOL_IDENTICAL;
    240     }
    241     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
    242         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
    243     {
    244         return UCOL_PRIMARY;
    245     }
    246     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
    247         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
    248     {
    249         return UCOL_SECONDARY;
    250     }
    251     return UCOL_TERTIARY;
    252 }
    253 
    254 
    255 /*static
    256 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    257 
    258     uint32_t CE = lh->baseCE;
    259     uint32_t SecondCE = lh->baseContCE;
    260 
    261     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    262     uint32_t previousCE, previousContCE;
    263     int32_t iCE;
    264 
    265     iCE = ucol_inv_findCE(src, CE, SecondCE);
    266 
    267     if(iCE<0) {
    268         return -1;
    269     }
    270 
    271     CE &= strengthMask[strength];
    272     SecondCE &= strengthMask[strength];
    273 
    274     previousCE = CE;
    275     previousContCE = SecondCE;
    276 
    277     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
    278         previousCE = (*(CETable+3*(--iCE)));
    279         previousContCE = (*(CETable+3*(iCE)+1));
    280     }
    281     lh->previousCE = previousCE;
    282     lh->previousContCE = previousContCE;
    283 
    284     return iCE;
    285 }*/
    286 
    287 static
    288 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    289     uint32_t CE = lh->baseCE;
    290     uint32_t SecondCE = lh->baseContCE;
    291 
    292     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    293     uint32_t nextCE, nextContCE;
    294     int32_t iCE;
    295 
    296     iCE = ucol_inv_findCE(src, CE, SecondCE);
    297 
    298     if(iCE<0) {
    299         return -1;
    300     }
    301 
    302     CE &= strengthMask[strength];
    303     SecondCE &= strengthMask[strength];
    304 
    305     nextCE = CE;
    306     nextContCE = SecondCE;
    307 
    308     while((nextCE  & strengthMask[strength]) == CE
    309         && (nextContCE  & strengthMask[strength]) == SecondCE)
    310     {
    311         nextCE = (*(CETable+3*(++iCE)));
    312         nextContCE = (*(CETable+3*(iCE)+1));
    313     }
    314 
    315     lh->nextCE = nextCE;
    316     lh->nextContCE = nextContCE;
    317 
    318     return iCE;
    319 }
    320 
    321 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    322     /* reset all the gaps */
    323     int32_t i = 0;
    324     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    325     uint32_t st = 0;
    326     uint32_t t1, t2;
    327     int32_t pos;
    328 
    329     UColToken *tok = lh->first;
    330     uint32_t tokStrength = tok->strength;
    331 
    332     for(i = 0; i<3; i++) {
    333         lh->gapsHi[3*i] = 0;
    334         lh->gapsHi[3*i+1] = 0;
    335         lh->gapsHi[3*i+2] = 0;
    336         lh->gapsLo[3*i] = 0;
    337         lh->gapsLo[3*i+1] = 0;
    338         lh->gapsLo[3*i+2] = 0;
    339         lh->numStr[i] = 0;
    340         lh->fStrToken[i] = NULL;
    341         lh->lStrToken[i] = NULL;
    342         lh->pos[i] = -1;
    343     }
    344 
    345     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    346 
    347     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
    348         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    349         lh->pos[0] = 0;
    350         t1 = lh->baseCE;
    351         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
    352         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    353         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    354         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    355         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
    356         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
    357 
    358         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
    359         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
    360 
    361         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    362         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    363         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    364     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
    365         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    366         lh->pos[0] = 0;
    367         t1 = lh->baseCE;
    368         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
    369         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    370         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    371         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    372         t1 = lh->nextCE;
    373         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
    374         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    375         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    376         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    377     } else {
    378         for(;;) {
    379             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    380                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
    381                     lh->fStrToken[tokStrength] = tok;
    382                 } else { /* The CE must be implicit, since it's not in the table */
    383                     /* Error */
    384                     *status = U_INTERNAL_PROGRAM_ERROR;
    385                 }
    386             }
    387 
    388             while(tok != NULL && tok->strength >= tokStrength) {
    389                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    390                     lh->lStrToken[tokStrength] = tok;
    391                 }
    392                 tok = tok->next;
    393             }
    394             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
    395                 /* check if previous interval is the same and merge the intervals if it is so */
    396                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
    397                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
    398                     lh->fStrToken[tokStrength+1] = NULL;
    399                     lh->lStrToken[tokStrength+1] = NULL;
    400                     lh->pos[tokStrength+1] = -1;
    401                 }
    402             }
    403             if(tok != NULL) {
    404                 tokStrength = tok->strength;
    405             } else {
    406                 break;
    407             }
    408         }
    409         for(st = 0; st < 3; st++) {
    410             if((pos = lh->pos[st]) >= 0) {
    411                 t1 = *(CETable+3*(pos));
    412                 t2 = *(CETable+3*(pos)+1);
    413                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    414                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    415                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    416                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    417                 //pos--;
    418                 //t1 = *(CETable+3*(pos));
    419                 //t2 = *(CETable+3*(pos)+1);
    420                 t1 = lh->baseCE;
    421                 t2 = lh->baseContCE;
    422                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    423                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    424                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    425             }
    426         }
    427     }
    428 }
    429 
    430 
    431 #define ucol_countBytes(value, noOfBytes)   \
    432 {                               \
    433     uint32_t mask = 0xFFFFFFFF;   \
    434     (noOfBytes) = 0;              \
    435     while(mask != 0) {            \
    436     if(((value) & mask) != 0) { \
    437     (noOfBytes)++;            \
    438     }                           \
    439     mask >>= 8;                 \
    440     }                             \
    441 }
    442 
    443 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    444     if(U_SUCCESS(*status)) {
    445         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    446     }
    447     return g->current;
    448 }
    449 
    450 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    451     /* TODO: rename to enum names */
    452     uint32_t high, low, count=1;
    453     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
    454 
    455     if(strength == UCOL_SECONDARY) {
    456         low = UCOL_COMMON_TOP2<<24;
    457         high = 0xFFFFFFFF;
    458         count = 0xFF - UCOL_COMMON_TOP2;
    459     } else {
    460         low = UCOL_BYTE_COMMON << 24; //0x05000000;
    461         high = 0x40000000;
    462         count = 0x40 - UCOL_BYTE_COMMON;
    463     }
    464 
    465     if(tok->next != NULL && tok->next->strength == strength) {
    466         count = tok->next->toInsert;
    467     }
    468 
    469     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    470     g->current = UCOL_BYTE_COMMON<<24;
    471 
    472     if(g->noOfRanges == 0) {
    473         *status = U_INTERNAL_PROGRAM_ERROR;
    474     }
    475     return g->current;
    476 }
    477 
    478 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
    479     uint32_t strength = tok->strength;
    480     uint32_t low = lows[fStrength*3+strength];
    481     uint32_t high = highs[fStrength*3+strength];
    482     uint32_t maxByte = 0;
    483     if(strength == UCOL_TERTIARY) {
    484         maxByte = 0x3F;
    485     } else if(strength == UCOL_PRIMARY) {
    486         maxByte = 0xFE;
    487     } else {
    488         maxByte = 0xFF;
    489     }
    490 
    491     uint32_t count = tok->toInsert;
    492 
    493     if(low >= high && strength > UCOL_PRIMARY) {
    494         int32_t s = strength;
    495         for(;;) {
    496             s--;
    497             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
    498                 if(strength == UCOL_SECONDARY) {
    499                     if (low < UCOL_COMMON_TOP2<<24 ) {
    500                        // Override if low range is less than UCOL_COMMON_TOP2.
    501 		        low = UCOL_COMMON_TOP2<<24;
    502                     }
    503                     high = 0xFFFFFFFF;
    504                 } else {
    505                     // Override if low range is less than UCOL_COMMON_BOT3.
    506 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
    507                         low = UCOL_COMMON_BOT3<<24;
    508 		    }
    509                     high = 0x40000000;
    510                 }
    511                 break;
    512             }
    513             if(s<0) {
    514                 *status = U_INTERNAL_PROGRAM_ERROR;
    515                 return 0;
    516             }
    517         }
    518     }
    519 
    520     if(low < 0x02000000) {
    521         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
    522         // See http://site.icu-project.org/design/collation/bytes
    523         low = 0x02000000;
    524     }
    525 
    526     if(strength == UCOL_SECONDARY) { /* similar as simple */
    527         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    528             low = UCOL_COMMON_TOP2<<24;
    529         }
    530         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    531             high = UCOL_COMMON_TOP2<<24;
    532         }
    533         if(low < (UCOL_COMMON_BOT2<<24)) {
    534             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
    535             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    536             //g->current = UCOL_COMMON_BOT2<<24;
    537             return g->current;
    538         }
    539     }
    540 
    541     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    542     if(g->noOfRanges == 0) {
    543         *status = U_INTERNAL_PROGRAM_ERROR;
    544     }
    545     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    546     return g->current;
    547 }
    548 
    549 static
    550 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    551     uint32_t i = 0;
    552     UChar c;
    553 
    554     if(U_FAILURE(*status)) {
    555         return 0;
    556     }
    557 
    558     if(sourceLen > resLen) {
    559         *status = U_MEMORY_ALLOCATION_ERROR;
    560         return 0;
    561     }
    562 
    563     for(i = 0; i < sourceLen; i++) {
    564         c = source[i];
    565         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    566             switch(c - 0x3000) {
    567             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
    568             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
    569                 c++;
    570                 break;
    571             case 0xF5:
    572                 c = 0x30AB;
    573                 break;
    574             case 0xF6:
    575                 c = 0x30B1;
    576                 break;
    577             }
    578         }
    579         resBuf[i] = c;
    580     }
    581     return sourceLen;
    582 }
    583 
    584 static
    585 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    586     uint32_t i = 0;
    587     UChar c;
    588 
    589     if(U_FAILURE(*status)) {
    590         return 0;
    591     }
    592 
    593     if(sourceLen > resLen) {
    594         *status = U_MEMORY_ALLOCATION_ERROR;
    595         return 0;
    596     }
    597 
    598     for(i = 0; i < sourceLen; i++) {
    599         c = source[i];
    600         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    601             switch(c - 0x3000) {
    602             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
    603             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
    604                 c--;
    605                 break;
    606             case 0xAB:
    607                 c = 0x30F5;
    608                 break;
    609             case 0xB1:
    610                 c = 0x30F6;
    611                 break;
    612             }
    613         }
    614         resBuf[i] = c;
    615     }
    616     return sourceLen;
    617 }
    618 
    619 static
    620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
    621     uint32_t i = 0;
    622     UChar n[128];
    623     uint32_t nLen = 0;
    624     uint32_t uCount = 0, lCount = 0;
    625 
    626     collIterate s;
    627     uint32_t order = 0;
    628 
    629     if(U_FAILURE(*status)) {
    630         return UCOL_LOWER_CASE;
    631     }
    632 
    633     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
    634     if(U_SUCCESS(*status)) {
    635         for(i = 0; i < nLen; i++) {
    636             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
    637             order = ucol_getNextCE(UCA, &s, status);
    638             if(isContinuation(order)) {
    639                 *status = U_INTERNAL_PROGRAM_ERROR;
    640                 return UCOL_LOWER_CASE;
    641             }
    642             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
    643                 uCount++;
    644             } else {
    645                 if(u_islower(n[i])) {
    646                     lCount++;
    647                 } else if(U_SUCCESS(*status)) {
    648                     UChar sk[1], lk[1];
    649                     u_toSmallKana(&n[i], 1, sk, 1, status);
    650                     u_toLargeKana(&n[i], 1, lk, 1, status);
    651                     if(sk[0] == n[i] && lk[0] != n[i]) {
    652                         lCount++;
    653                     }
    654                 }
    655             }
    656         }
    657     }
    658 
    659     if(uCount != 0 && lCount != 0) {
    660         return UCOL_MIXED_CASE;
    661     } else if(uCount != 0) {
    662         return UCOL_UPPER_CASE;
    663     } else {
    664         return UCOL_LOWER_CASE;
    665     }
    666 }
    667 
    668 
    669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    670     /* this one makes the table and stuff */
    671     uint32_t noOfBytes[3];
    672     uint32_t i;
    673 
    674     for(i = 0; i<3; i++) {
    675         ucol_countBytes(CEparts[i], noOfBytes[i]);
    676     }
    677 
    678     /* Here we have to pack CEs from parts */
    679 
    680     uint32_t CEi = 0;
    681     uint32_t value = 0;
    682 
    683     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
    684         if(CEi > 0) {
    685             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    686         } else {
    687             value = 0;
    688         }
    689 
    690         if(2*CEi<noOfBytes[0]) {
    691             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
    692         }
    693         if(CEi<noOfBytes[1]) {
    694             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
    695         }
    696         if(CEi<noOfBytes[2]) {
    697             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
    698         }
    699         tok->CEs[CEi] = value;
    700         CEi++;
    701     }
    702     if(CEi == 0) { /* totally ignorable */
    703         tok->noOfCEs = 1;
    704         tok->CEs[0] = 0;
    705     } else { /* there is at least something */
    706         tok->noOfCEs = CEi;
    707     }
    708 
    709 
    710     // we want to set case bits here and now, not later.
    711     // Case bits handling
    712     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
    713         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    714         int32_t cSize = (tok->source & 0xFF000000) >> 24;
    715         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
    716 
    717         if(cSize > 1) {
    718             // Do it manually
    719             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
    720         } else {
    721             // Copy it from the UCA
    722             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
    723             tok->CEs[0] |= (caseCE & 0xC0);
    724         }
    725     }
    726 
    727 #if UCOL_DEBUG==2
    728     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    729     for(i = 0; i<tok->noOfCEs; i++) {
    730         fprintf(stderr, "%08X ", tok->CEs[i]);
    731     }
    732     fprintf(stderr, "\n");
    733 #endif
    734 }
    735 
    736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    737     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    738     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
    739 
    740     UColToken *tok = lh->last;
    741     uint32_t t[UCOL_STRENGTH_LIMIT];
    742 
    743     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
    744 
    745     tok->toInsert = 1;
    746     t[tok->strength] = 1;
    747 
    748     while(tok->previous != NULL) {
    749         if(tok->previous->strength < tok->strength) { /* going up */
    750             t[tok->strength] = 0;
    751             t[tok->previous->strength]++;
    752         } else if(tok->previous->strength > tok->strength) { /* going down */
    753             t[tok->previous->strength] = 1;
    754         } else {
    755             t[tok->strength]++;
    756         }
    757         tok=tok->previous;
    758         tok->toInsert = t[tok->strength];
    759     }
    760 
    761     tok->toInsert = t[tok->strength];
    762     ucol_inv_getGapPositions(src, lh, status);
    763 
    764 #if UCOL_DEBUG
    765     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    766     int32_t j = 2;
    767     for(j = 2; j >= 0; j--) {
    768         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    769         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    770     }
    771     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    772 
    773     do {
    774         fprintf(stderr,"%i", tok->strength);
    775         tok = tok->next;
    776     } while(tok != NULL);
    777     fprintf(stderr, "\n");
    778 
    779     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    780 
    781     do {
    782         fprintf(stderr,"%i", tok->toInsert);
    783         tok = tok->next;
    784     } while(tok != NULL);
    785 #endif
    786 
    787     tok = lh->first;
    788     uint32_t fStrength = UCOL_IDENTICAL;
    789     uint32_t initStrength = UCOL_IDENTICAL;
    790 
    791 
    792     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    793     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    794     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
    795 
    796     while (tok != NULL && U_SUCCESS(*status)) {
    797         fStrength = tok->strength;
    798         if(fStrength < initStrength) {
    799             initStrength = fStrength;
    800             if(lh->pos[fStrength] == -1) {
    801                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
    802                     fStrength--;
    803                 }
    804                 if(lh->pos[fStrength] == -1) {
    805                     *status = U_INTERNAL_PROGRAM_ERROR;
    806                     return;
    807                 }
    808             }
    809             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
    810                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    811                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
    812                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
    813                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
    814             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
    815                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    816                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
    817                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    818                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    819             } else { /* primaries */
    820                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
    821                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    822                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    824             }
    825         } else {
    826             if(tok->strength == UCOL_TERTIARY) {
    827                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
    828             } else if(tok->strength == UCOL_SECONDARY) {
    829                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
    830                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    831             } else if(tok->strength == UCOL_PRIMARY) {
    832                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
    833                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    834                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    835             }
    836         }
    837         ucol_doCE(src, CEparts, tok, status);
    838         tok = tok->next;
    839     }
    840 }
    841 
    842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    843     UCAElements el;
    844     UColToken *tok = lh->first;
    845     UColToken *expt = NULL;
    846     uint32_t i = 0, j = 0;
    847     UChar32 fcdHighStart;
    848     const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    849 
    850     while(tok != NULL && U_SUCCESS(*status)) {
    851         /* first, check if there are any expansions */
    852         /* if there are expansions, we need to do a little bit more processing */
    853         /* since parts of expansion can be tailored, while others are not */
    854         if(tok->expansion != 0) {
    855             uint32_t len = tok->expansion >> 24;
    856             uint32_t currentSequenceLen = len;
    857             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
    858             //uint32_t exp = currentSequenceLen | expOffset;
    859             UColToken exp;
    860             exp.source = currentSequenceLen | expOffset;
    861             exp.rulesToParseHdl = &(src->source);
    862 
    863             while(len > 0) {
    864                 currentSequenceLen = len;
    865                 while(currentSequenceLen > 0) {
    866                     exp.source = (currentSequenceLen << 24) | expOffset;
    867                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
    868                         uint32_t noOfCEsToCopy = expt->noOfCEs;
    869                         for(j = 0; j<noOfCEsToCopy; j++) {
    870                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
    871                         }
    872                         tok->noOfExpCEs += noOfCEsToCopy;
    873                         // Smart people never try to add codepoints and CEs.
    874                         // For some odd reason, it won't work.
    875                         expOffset += currentSequenceLen; //noOfCEsToCopy;
    876                         len -= currentSequenceLen; //noOfCEsToCopy;
    877                         break;
    878                     } else {
    879                         currentSequenceLen--;
    880                     }
    881                 }
    882                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
    883                     /* will have to get one from UCA */
    884                     /* first, get the UChars from the rules */
    885                     /* then pick CEs out until there is no more and stuff them into expansion */
    886                     collIterate s;
    887                     uint32_t order = 0;
    888                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
    889 
    890                     for(;;) {
    891                         order = ucol_getNextCE(src->UCA, &s, status);
    892                         if(order == UCOL_NO_MORE_CES) {
    893                             break;
    894                         }
    895                         tok->expCEs[tok->noOfExpCEs++] = order;
    896                     }
    897                     expOffset++;
    898                     len--;
    899                 }
    900             }
    901         } else {
    902             tok->noOfExpCEs = 0;
    903         }
    904 
    905         /* set the ucaelement with obtained values */
    906         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
    907         /* copy CEs */
    908         for(i = 0; i<tok->noOfCEs; i++) {
    909             el.CEs[i] = tok->CEs[i];
    910         }
    911         for(i = 0; i<tok->noOfExpCEs; i++) {
    912             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
    913         }
    914 
    915         /* copy UChars */
    916         // We kept prefix and source kind of together, as it is a kind of a contraction.
    917         // However, now we have to slice the prefix off the main thing -
    918         el.prefix = el.prefixChars;
    919         el.cPoints = el.uchars;
    920         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
    921             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
    922             // decomposed elements to the unsaf table.
    923             el.prefixSize = tok->prefix>>24;
    924             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
    925 
    926             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
    927             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
    928         } else {
    929             el.prefixSize = 0;
    930             *el.prefix = 0;
    931 
    932             el.cSize = (tok->source >> 24);
    933             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    934         }
    935         if(src->UCA != NULL) {
    936             for(i = 0; i<el.cSize; i++) {
    937                 if(UCOL_ISJAMO(el.cPoints[i])) {
    938                     t->image->jamoSpecial = TRUE;
    939                 }
    940             }
    941             if (!src->buildCCTabFlag && el.cSize > 0) {
    942                 // Check the trailing canonical combining class (tccc) of the last character.
    943                 const UChar *s = el.cPoints + el.cSize;
    944                 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
    945                 if ((fcd & 0xff) != 0) {
    946                     src->buildCCTabFlag = TRUE;
    947                 }
    948             }
    949         }
    950 
    951         /* and then, add it */
    952 #if UCOL_DEBUG==2
    953         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
    954 #endif
    955         uprv_uca_addAnElement(t, &el, status);
    956 
    957 #if UCOL_DEBUG_DUPLICATES
    958         if(*status != U_ZERO_ERROR) {
    959             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
    960             *status = U_ZERO_ERROR;
    961         }
    962 #endif
    963 
    964         tok = tok->next;
    965     }
    966 }
    967 
    968 U_CDECL_BEGIN
    969 static UBool U_CALLCONV
    970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    971     UErrorCode status = U_ZERO_ERROR;
    972     tempUCATable *t = (tempUCATable *)context;
    973     if(value == 0) {
    974         while(start < limit) {
    975             uint32_t CE = utrie_get32(t->mapping, start, NULL);
    976             if(CE == UCOL_NOT_FOUND) {
    977                 UCAElements el;
    978                 el.isThai = FALSE;
    979                 el.prefixSize = 0;
    980                 el.prefixChars[0] = 0;
    981                 el.prefix = el.prefixChars;
    982                 el.cPoints = el.uchars;
    983 
    984                 el.cSize = 0;
    985                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
    986 
    987                 el.noOfCEs = 1;
    988                 el.CEs[0] = 0;
    989                 uprv_uca_addAnElement(t, &el, &status);
    990 
    991             }
    992             start++;
    993         }
    994     }
    995     if(U_FAILURE(status)) {
    996         return FALSE;
    997     } else {
    998         return TRUE;
    999     }
   1000 }
   1001 U_CDECL_END
   1002 
   1003 static void
   1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
   1005                                UChar32 start, UChar32 end,
   1006                                UErrorCode *status)
   1007 {
   1008     //UChar decomp[256];
   1009     uint32_t CE = UCOL_NOT_FOUND;
   1010     UChar32 u = 0;
   1011     UCAElements el;
   1012     el.isThai = FALSE;
   1013     el.prefixSize = 0;
   1014     el.prefixChars[0] = 0;
   1015     collIterate colIt;
   1016 
   1017     if(U_SUCCESS(*status)) {
   1018         for(u = start; u<=end; u++) {
   1019             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
   1020                 /* this test is for contractions that are missing the starting element. */
   1021                 || ((isCntTableElement(CE)) &&
   1022                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
   1023                 )
   1024             {
   1025                 el.cSize = 0;
   1026                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
   1027                 //decomp[0] = (UChar)u;
   1028                 //el.uchars[0] = (UChar)u;
   1029                 el.cPoints = el.uchars;
   1030                 //el.cSize = 1;
   1031                 el.noOfCEs = 0;
   1032                 el.prefix = el.prefixChars;
   1033                 el.prefixSize = 0;
   1034                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
   1035                 // We actually want to check whether this element is a special
   1036                 // If it is an implicit element (hangul, CJK - we want to copy the
   1037                 // special, not the resolved CEs) - for hangul, copying resolved
   1038                 // would just make things the same (there is an expansion and it
   1039                 // takes approximately the same amount of time to resolve as
   1040                 // falling back to the UCA).
   1041                 /*
   1042                 UTRIE_GET32(src->UCA->mapping, u, CE);
   1043                 tag = getCETag(CE);
   1044                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
   1045                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
   1046                 || tag == LEAD_SURROGATE_TAG) {
   1047                 el.CEs[el.noOfCEs++] = CE;
   1048                 } else {
   1049                 */
   1050                 // It turns out that it does not make sense to keep implicits
   1051                 // unresolved. The cost of resolving them is big enough so that
   1052                 // it doesn't make any difference whether we have to go to the UCA
   1053                 // or not.
   1054                 {
   1055                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
   1056                     while(CE != UCOL_NO_MORE_CES) {
   1057                         CE = ucol_getNextCE(src->UCA, &colIt, status);
   1058                         if(CE != UCOL_NO_MORE_CES) {
   1059                             el.CEs[el.noOfCEs++] = CE;
   1060                         }
   1061                     }
   1062                 }
   1063                 uprv_uca_addAnElement(t, &el, status);
   1064             }
   1065         }
   1066     }
   1067 }
   1068 
   1069 U_CFUNC UCATableHeader *
   1070 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
   1071     U_NAMESPACE_USE
   1072 
   1073     uint32_t i = 0;
   1074     if(U_FAILURE(*status)) {
   1075         return NULL;
   1076     }
   1077     /*
   1078     2.  Eliminate the negative lists by doing the following for each non-null negative list:
   1079     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
   1080     create new ListHeader X
   1081     o   reverse the list, add to the end of X's positive list. Reset the strength of the
   1082     first item you add, based on the stronger strength levels of the two lists.
   1083     */
   1084     /*
   1085     3.  For each ListHeader with a non-null positive list:
   1086     */
   1087     /*
   1088     o   Find all character strings with CEs between the baseCE and the
   1089     next/previous CE, at the strength of the first token. Add these to the
   1090     tailoring.
   1091     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
   1092     tailoring has & x < z...
   1093     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
   1094     */
   1095     /* It is possible that this part should be done even while constructing list */
   1096     /* The problem is that it is unknown what is going to be the strongest weight */
   1097     /* So we might as well do it here */
   1098 
   1099     /*
   1100     o   Allocate CEs for each token in the list, based on the total number N of the
   1101     largest level difference, and the gap G between baseCE and nextCE at that
   1102     level. The relation * between the last item and nextCE is the same as the
   1103     strongest strength.
   1104     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
   1105     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
   1106     Then fit b and c into the secondary gap between a and d, then fit q
   1107     into the tertiary gap between b and c.
   1108 
   1109     o   Example: baseCE << b <<< q << c * nextCE(X,2)
   1110     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
   1111     Then fit q into the tertiary gap between b and c.
   1112     o   When incrementing primary values, we will not cross high byte
   1113     boundaries except where there is only a single-byte primary. That is to
   1114     ensure that the script reordering will continue to work.
   1115     */
   1116     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
   1117     /* test for NULL */
   1118     if (image == NULL) {
   1119         *status = U_MEMORY_ALLOCATION_ERROR;
   1120         return NULL;
   1121     }
   1122     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
   1123 
   1124     for(i = 0; i<src->resultLen; i++) {
   1125         /* now we need to generate the CEs */
   1126         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1127         /* According to strength                                                          */
   1128         if(U_SUCCESS(*status)) {
   1129             if(src->lh[i].first) { // if there are any elements
   1130                 // due to the way parser works, subsequent tailorings
   1131                 // may remove all the elements from a sequence, therefore
   1132                 // leaving an empty tailoring sequence.
   1133                 ucol_initBuffers(src, &src->lh[i], status);
   1134             }
   1135         }
   1136         if(U_FAILURE(*status)) {
   1137             uprv_free(image);
   1138             return NULL;
   1139         }
   1140     }
   1141 
   1142     if(src->varTop != NULL) { /* stuff the variable top value */
   1143         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
   1144         /* remove it from the list */
   1145         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
   1146             src->varTop->listHeader->first = src->varTop->next;
   1147         }
   1148         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
   1149             src->varTop->listHeader->last = src->varTop->previous;
   1150         }
   1151         if(src->varTop->next != NULL) {
   1152             src->varTop->next->previous = src->varTop->previous;
   1153         }
   1154         if(src->varTop->previous != NULL) {
   1155             src->varTop->previous->next = src->varTop->next;
   1156         }
   1157     }
   1158 
   1159 
   1160     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
   1161     if(U_FAILURE(*status)) {
   1162         uprv_free(image);
   1163         return NULL;
   1164     }
   1165 
   1166 
   1167     /* After this, we have assigned CE values to all regular CEs      */
   1168     /* now we will go through list once more and resolve expansions,  */
   1169     /* make UCAElements structs and add them to table                 */
   1170     for(i = 0; i<src->resultLen; i++) {
   1171         /* now we need to generate the CEs */
   1172         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1173         /* According to strength                                                          */
   1174         if(U_SUCCESS(*status)) {
   1175             ucol_createElements(src, t, &src->lh[i], status);
   1176         }
   1177     }
   1178 
   1179     UCAElements el;
   1180     el.isThai = FALSE;
   1181     el.prefixSize = 0;
   1182     el.prefixChars[0] = 0;
   1183 
   1184     /* add latin-1 stuff */
   1185     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
   1186 
   1187     /* add stuff for copying */
   1188     if(src->copySet != NULL) {
   1189         int32_t i = 0;
   1190         UnicodeSet *set = (UnicodeSet *)src->copySet;
   1191         for(i = 0; i < set->getRangeCount(); i++) {
   1192             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
   1193         }
   1194     }
   1195 
   1196     if(U_SUCCESS(*status)) {
   1197         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
   1198 
   1199         uint32_t tailoredCE = UCOL_NOT_FOUND;
   1200         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
   1201         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
   1202         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
   1203         // Check for null pointer
   1204         if (ucaEl == NULL) {
   1205         	*status = U_MEMORY_ALLOCATION_ERROR;
   1206         	return NULL;
   1207         }
   1208         while(*conts != 0) {
   1209             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
   1210             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
   1211             if(tailoredCE != UCOL_NOT_FOUND) {
   1212                 UBool needToAdd = TRUE;
   1213                 if(isCntTableElement(tailoredCE)) {
   1214                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
   1215                         needToAdd = FALSE;
   1216                     }
   1217                 }
   1218                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
   1219                     UCAElements elm;
   1220                     elm.cPoints = el.uchars;
   1221                     elm.noOfCEs = 0;
   1222                     elm.uchars[0] = *conts;
   1223                     elm.uchars[1] = 0;
   1224                     elm.cSize = 1;
   1225                     elm.prefixChars[0] = *(conts+2);
   1226                     elm.isThai = FALSE;
   1227                     elm.prefix = elm.prefixChars;
   1228                     elm.prefixSize = 1;
   1229                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
   1230                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
   1231                         needToAdd = TRUE;
   1232                     }
   1233                 }
   1234                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1235                     needToAdd = FALSE;
   1236                 }
   1237 
   1238                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
   1239                     if (*(conts+1) != 0) {  // contractions
   1240                         el.prefix = el.prefixChars;
   1241                         el.prefixSize = 0;
   1242                         el.cPoints = el.uchars;
   1243                         el.noOfCEs = 0;
   1244                         el.uchars[0] = *conts;
   1245                         el.uchars[1] = *(conts+1);
   1246                         if(*(conts+2)!=0) {
   1247                             el.uchars[2] = *(conts+2);
   1248                             el.cSize = 3;
   1249                         } else {
   1250                             el.cSize = 2;
   1251                         }
   1252                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
   1253                     }
   1254                     else { // pre-context character
   1255                         UChar str[4] = { 0 };
   1256                         int32_t len=0;
   1257                         int32_t preKeyLen=0;
   1258 
   1259                         el.cPoints = el.uchars;
   1260                         el.noOfCEs = 0;
   1261                         el.uchars[0] = *conts;
   1262                         el.uchars[1] = 0;
   1263                         el.cSize = 1;
   1264                         el.prefixChars[0] = *(conts+2);
   1265                         el.prefix = el.prefixChars;
   1266                         el.prefixSize = 1;
   1267                         if (el.prefixChars[0]!=0) {
   1268                             // get CE of prefix character first
   1269                             str[0]=el.prefixChars[0];
   1270                             str[1]=0;
   1271                             ucol_setText(ucaEl, str, 1, status);
   1272                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
   1273                                     != UCOL_NULLORDER) {
   1274                                 preKeyLen++;  // count number of keys for prefix character
   1275                             }
   1276                             str[len++] = el.prefixChars[0];
   1277                         }
   1278 
   1279                         str[len++] = el.uchars[0];
   1280                         str[len]=0;
   1281                         ucol_setText(ucaEl, str, len, status);
   1282                         // Skip the keys for prefix character, then copy the rest to el.
   1283                         while ((preKeyLen-->0) &&
   1284                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1285                             continue;
   1286                         }
   1287 
   1288                     }
   1289                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1290                         el.noOfCEs++;
   1291                     }
   1292                     uprv_uca_addAnElement(t, &el, status);
   1293                 }
   1294 
   1295             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1296                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
   1297             }
   1298             conts+=3;
   1299         }
   1300         ucol_closeElements(ucaEl);
   1301     }
   1302 
   1303     // Add completely ignorable elements
   1304     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
   1305 
   1306     // add tailoring characters related canonical closures
   1307     uprv_uca_canonicalClosure(t, src, NULL, status);
   1308 
   1309     /* still need to produce compatibility closure */
   1310 
   1311     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1312 
   1313     uprv_uca_closeTempTable(t);
   1314     uprv_free(image);
   1315 
   1316     return myData;
   1317 }
   1318 
   1319 U_CDECL_BEGIN
   1320 static UBool U_CALLCONV
   1321 ucol_bld_cleanup(void)
   1322 {
   1323     udata_close(invUCA_DATA_MEM);
   1324     invUCA_DATA_MEM = NULL;
   1325     _staticInvUCA = NULL;
   1326     return TRUE;
   1327 }
   1328 U_CDECL_END
   1329 
   1330 U_CAPI const InverseUCATableHeader * U_EXPORT2
   1331 ucol_initInverseUCA(UErrorCode *status)
   1332 {
   1333     if(U_FAILURE(*status)) return NULL;
   1334 
   1335     UBool needsInit;
   1336     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
   1337 
   1338     if(needsInit) {
   1339         InverseUCATableHeader *newInvUCA = NULL;
   1340         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
   1341 
   1342         if(U_FAILURE(*status)) {
   1343             if (result) {
   1344                 udata_close(result);
   1345             }
   1346             // This is not needed, as we are talking about
   1347             // memory we got from UData
   1348             //uprv_free(newInvUCA);
   1349         }
   1350 
   1351         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
   1352             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
   1353             UCollator *UCA = ucol_initUCA(status);
   1354             // UCA versions of UCA and inverse UCA should match
   1355             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
   1356                 *status = U_INVALID_FORMAT_ERROR;
   1357                 udata_close(result);
   1358                 return NULL;
   1359             }
   1360 
   1361             umtx_lock(NULL);
   1362             if(_staticInvUCA == NULL) {
   1363                 invUCA_DATA_MEM = result;
   1364                 _staticInvUCA = newInvUCA;
   1365                 result = NULL;
   1366                 newInvUCA = NULL;
   1367             }
   1368             umtx_unlock(NULL);
   1369 
   1370             if(newInvUCA != NULL) {
   1371                 udata_close(result);
   1372                 // This is not needed, as we are talking about
   1373                 // memory we got from UData
   1374                 //uprv_free(newInvUCA);
   1375             }
   1376             else {
   1377                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
   1378             }
   1379         }
   1380     }
   1381     return _staticInvUCA;
   1382 }
   1383 
   1384 /* This is the data that is used for non-script reordering codes. These _must_ be kept
   1385  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
   1386  */
   1387 static const char* ReorderingTokenNames[] = {
   1388     "SPACE",
   1389     "PUNCT",
   1390     "SYMBOL",
   1391     "CURRENCY",
   1392     "DIGIT",
   1393     NULL
   1394 };
   1395 
   1396 static void toUpper(const char* src, char* dst, uint32_t length) {
   1397    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
   1398        *dst = toupper(*src);
   1399    }
   1400    *dst = '\0';
   1401 }
   1402 
   1403 U_INTERNAL int32_t U_EXPORT2
   1404 ucol_findReorderingEntry(const char* name) {
   1405     char buffer[32];
   1406     toUpper(name, buffer, 32);
   1407     for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
   1408         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
   1409             return entry + UCOL_REORDER_CODE_FIRST;
   1410         }
   1411     }
   1412     return USCRIPT_INVALID_CODE;
   1413 }
   1414 
   1415 U_NAMESPACE_END
   1416 
   1417 #endif /* #if !UCONFIG_NO_COLLATION */
   1418