Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_bld.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module builds a collator based on the rule set.
     17 *
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/ucoleitr.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 #include "unicode/uscript.h"
     29 #include "unicode/ustring.h"
     30 #include "normalizer2impl.h"
     31 #include "ucol_bld.h"
     32 #include "ucol_elm.h"
     33 #include "ucol_cnt.h"
     34 #include "ucln_in.h"
     35 #include "umutex.h"
     36 #include "cmemory.h"
     37 #include "cstring.h"
     38 
     39 static const InverseUCATableHeader* _staticInvUCA = NULL;
     40 static UDataMemory* invUCA_DATA_MEM = NULL;
     41 
     42 U_CDECL_BEGIN
     43 static UBool U_CALLCONV
     44 isAcceptableInvUCA(void * /*context*/,
     45                    const char * /*type*/, const char * /*name*/,
     46                    const UDataInfo *pInfo)
     47 {
     48     /* context, type & name are intentionally not used */
     49     if( pInfo->size>=20 &&
     50         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     51         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     52         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
     53         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
     54         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
     55         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
     56         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
     57         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
     58         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
     59         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
     60         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
     61         )
     62     {
     63         UVersionInfo UCDVersion;
     64         u_getUnicodeVersion(UCDVersion);
     65         return (pInfo->dataVersion[0]==UCDVersion[0] &&
     66             pInfo->dataVersion[1]==UCDVersion[1]);
     67             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
     68             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
     69             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
     70     } else {
     71         return FALSE;
     72     }
     73 }
     74 U_CDECL_END
     75 
     76 /*
     77 * Takes two CEs (lead and continuation) and
     78 * compares them as CEs should be compared:
     79 * primary vs. primary, secondary vs. secondary
     80 * tertiary vs. tertiary
     81 */
     82 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
     83     uint32_t s1 = source0, s2, t1 = target0, t2;
     84     if(isContinuation(source1)) {
     85         s2 = source1;
     86     } else {
     87         s2 = 0;
     88     }
     89     if(isContinuation(target1)) {
     90         t2 = target1;
     91     } else {
     92         t2 = 0;
     93     }
     94 
     95     uint32_t s = 0, t = 0;
     96     if(s1 == t1 && s2 == t2) {
     97         return 0;
     98     }
     99     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
    100     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
    101     if(s < t) {
    102         return -1;
    103     } else if(s > t) {
    104         return 1;
    105     } else {
    106         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    107         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    108         if(s < t) {
    109             return -1;
    110         } else if(s > t) {
    111             return 1;
    112         } else {
    113             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    114             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    115             if(s < t) {
    116                 return -1;
    117             } else {
    118                 return 1;
    119             }
    120         }
    121     }
    122 }
    123 
    124 static
    125 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
    126     uint32_t bottom = 0, top = src->invUCA->tableSize;
    127     uint32_t i = 0;
    128     uint32_t first = 0, second = 0;
    129     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    130     int32_t res = 0;
    131 
    132     while(bottom < top-1) {
    133         i = (top+bottom)/2;
    134         first = *(CETable+3*i);
    135         second = *(CETable+3*i+1);
    136         res = compareCEs(first, second, CE, SecondCE);
    137         if(res > 0) {
    138             top = i;
    139         } else if(res < 0) {
    140             bottom = i;
    141         } else {
    142             break;
    143         }
    144     }
    145 
    146     /* weiv:                                                  */
    147     /* in searching for elements, I have removed the failure  */
    148     /* The reason for this is that the builder does not rely  */
    149     /* on search mechanism telling it that it didn't find an  */
    150     /* element. However, indirect positioning relies on being */
    151     /* able to find the elements around any CE, even if it is */
    152     /* not defined in the UCA. */
    153     return i;
    154     /*
    155     if((first == CE && second == SecondCE)) {
    156     return i;
    157     } else {
    158     return -1;
    159     }
    160     */
    161 }
    162 
    163 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    164     0xFFFF0000,
    165     0xFFFFFF00,
    166     0xFFFFFFFF
    167 };
    168 
    169 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
    170                                             uint32_t CE, uint32_t contCE,
    171                                             uint32_t *nextCE, uint32_t *nextContCE,
    172                                             uint32_t strength)
    173 {
    174     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    175     int32_t iCE;
    176 
    177     iCE = ucol_inv_findCE(src, CE, contCE);
    178 
    179     if(iCE<0) {
    180         *nextCE = UCOL_NOT_FOUND;
    181         return -1;
    182     }
    183 
    184     CE &= strengthMask[strength];
    185     contCE &= strengthMask[strength];
    186 
    187     *nextCE = CE;
    188     *nextContCE = contCE;
    189 
    190     while((*nextCE  & strengthMask[strength]) == CE
    191         && (*nextContCE  & strengthMask[strength]) == contCE)
    192     {
    193         *nextCE = (*(CETable+3*(++iCE)));
    194         *nextContCE = (*(CETable+3*(iCE)+1));
    195     }
    196 
    197     return iCE;
    198 }
    199 
    200 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
    201                                             uint32_t CE, uint32_t contCE,
    202                                             uint32_t *prevCE, uint32_t *prevContCE,
    203                                             uint32_t strength)
    204 {
    205     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    206     int32_t iCE;
    207 
    208     iCE = ucol_inv_findCE(src, CE, contCE);
    209 
    210     if(iCE<0) {
    211         *prevCE = UCOL_NOT_FOUND;
    212         return -1;
    213     }
    214 
    215     CE &= strengthMask[strength];
    216     contCE &= strengthMask[strength];
    217 
    218     *prevCE = CE;
    219     *prevContCE = contCE;
    220 
    221     while((*prevCE  & strengthMask[strength]) == CE
    222         && (*prevContCE  & strengthMask[strength])== contCE
    223         && iCE > 0) /* this condition should prevent falling off the edge of the world */
    224     {
    225         /* here, we end up in a singularity - zero */
    226         *prevCE = (*(CETable+3*(--iCE)));
    227         *prevContCE = (*(CETable+3*(iCE)+1));
    228     }
    229 
    230     return iCE;
    231 }
    232 
    233 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
    234                                                        uint32_t prevCE, uint32_t prevContCE)
    235 {
    236     if(prevCE == CE && prevContCE == contCE) {
    237         return UCOL_IDENTICAL;
    238     }
    239     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
    240         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
    241     {
    242         return UCOL_PRIMARY;
    243     }
    244     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
    245         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
    246     {
    247         return UCOL_SECONDARY;
    248     }
    249     return UCOL_TERTIARY;
    250 }
    251 
    252 
    253 /*static
    254 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    255 
    256     uint32_t CE = lh->baseCE;
    257     uint32_t SecondCE = lh->baseContCE;
    258 
    259     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    260     uint32_t previousCE, previousContCE;
    261     int32_t iCE;
    262 
    263     iCE = ucol_inv_findCE(src, CE, SecondCE);
    264 
    265     if(iCE<0) {
    266         return -1;
    267     }
    268 
    269     CE &= strengthMask[strength];
    270     SecondCE &= strengthMask[strength];
    271 
    272     previousCE = CE;
    273     previousContCE = SecondCE;
    274 
    275     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
    276         previousCE = (*(CETable+3*(--iCE)));
    277         previousContCE = (*(CETable+3*(iCE)+1));
    278     }
    279     lh->previousCE = previousCE;
    280     lh->previousContCE = previousContCE;
    281 
    282     return iCE;
    283 }*/
    284 
    285 static
    286 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    287     uint32_t CE = lh->baseCE;
    288     uint32_t SecondCE = lh->baseContCE;
    289 
    290     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    291     uint32_t nextCE, nextContCE;
    292     int32_t iCE;
    293 
    294     iCE = ucol_inv_findCE(src, CE, SecondCE);
    295 
    296     if(iCE<0) {
    297         return -1;
    298     }
    299 
    300     CE &= strengthMask[strength];
    301     SecondCE &= strengthMask[strength];
    302 
    303     nextCE = CE;
    304     nextContCE = SecondCE;
    305 
    306     while((nextCE  & strengthMask[strength]) == CE
    307         && (nextContCE  & strengthMask[strength]) == SecondCE)
    308     {
    309         nextCE = (*(CETable+3*(++iCE)));
    310         nextContCE = (*(CETable+3*(iCE)+1));
    311     }
    312 
    313     lh->nextCE = nextCE;
    314     lh->nextContCE = nextContCE;
    315 
    316     return iCE;
    317 }
    318 
    319 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    320     /* reset all the gaps */
    321     int32_t i = 0;
    322     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    323     uint32_t st = 0;
    324     uint32_t t1, t2;
    325     int32_t pos;
    326 
    327     UColToken *tok = lh->first;
    328     uint32_t tokStrength = tok->strength;
    329 
    330     for(i = 0; i<3; i++) {
    331         lh->gapsHi[3*i] = 0;
    332         lh->gapsHi[3*i+1] = 0;
    333         lh->gapsHi[3*i+2] = 0;
    334         lh->gapsLo[3*i] = 0;
    335         lh->gapsLo[3*i+1] = 0;
    336         lh->gapsLo[3*i+2] = 0;
    337         lh->numStr[i] = 0;
    338         lh->fStrToken[i] = NULL;
    339         lh->lStrToken[i] = NULL;
    340         lh->pos[i] = -1;
    341     }
    342 
    343     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    344 
    345     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
    346         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    347         lh->pos[0] = 0;
    348         t1 = lh->baseCE;
    349         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
    350         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    351         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    352         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    353         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
    354         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
    355 
    356         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
    357         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
    358 
    359         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    360         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    361         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    362     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
    363         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    364         lh->pos[0] = 0;
    365         t1 = lh->baseCE;
    366         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
    367         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    368         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    369         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    370         t1 = lh->nextCE;
    371         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
    372         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    373         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    374         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    375     } else {
    376         for(;;) {
    377             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    378                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
    379                     lh->fStrToken[tokStrength] = tok;
    380                 } else { /* The CE must be implicit, since it's not in the table */
    381                     /* Error */
    382                     *status = U_INTERNAL_PROGRAM_ERROR;
    383                 }
    384             }
    385 
    386             while(tok != NULL && tok->strength >= tokStrength) {
    387                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    388                     lh->lStrToken[tokStrength] = tok;
    389                 }
    390                 tok = tok->next;
    391             }
    392             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
    393                 /* check if previous interval is the same and merge the intervals if it is so */
    394                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
    395                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
    396                     lh->fStrToken[tokStrength+1] = NULL;
    397                     lh->lStrToken[tokStrength+1] = NULL;
    398                     lh->pos[tokStrength+1] = -1;
    399                 }
    400             }
    401             if(tok != NULL) {
    402                 tokStrength = tok->strength;
    403             } else {
    404                 break;
    405             }
    406         }
    407         for(st = 0; st < 3; st++) {
    408             if((pos = lh->pos[st]) >= 0) {
    409                 t1 = *(CETable+3*(pos));
    410                 t2 = *(CETable+3*(pos)+1);
    411                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    412                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    413                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    414                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    415                 //pos--;
    416                 //t1 = *(CETable+3*(pos));
    417                 //t2 = *(CETable+3*(pos)+1);
    418                 t1 = lh->baseCE;
    419                 t2 = lh->baseContCE;
    420                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    421                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    422                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    423             }
    424         }
    425     }
    426 }
    427 
    428 
    429 #define ucol_countBytes(value, noOfBytes)   \
    430 {                               \
    431     uint32_t mask = 0xFFFFFFFF;   \
    432     (noOfBytes) = 0;              \
    433     while(mask != 0) {            \
    434     if(((value) & mask) != 0) { \
    435     (noOfBytes)++;            \
    436     }                           \
    437     mask >>= 8;                 \
    438     }                             \
    439 }
    440 
    441 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    442     if(U_SUCCESS(*status)) {
    443         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    444     }
    445     return g->current;
    446 }
    447 
    448 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    449     /* TODO: rename to enum names */
    450     uint32_t high, low, count=1;
    451     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
    452 
    453     if(strength == UCOL_SECONDARY) {
    454         low = UCOL_COMMON_TOP2<<24;
    455         high = 0xFFFFFFFF;
    456         count = 0xFF - UCOL_COMMON_TOP2;
    457     } else {
    458         low = UCOL_BYTE_COMMON << 24; //0x05000000;
    459         high = 0x40000000;
    460         count = 0x40 - UCOL_BYTE_COMMON;
    461     }
    462 
    463     if(tok->next != NULL && tok->next->strength == strength) {
    464         count = tok->next->toInsert;
    465     }
    466 
    467     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    468     g->current = UCOL_BYTE_COMMON<<24;
    469 
    470     if(g->noOfRanges == 0) {
    471         *status = U_INTERNAL_PROGRAM_ERROR;
    472     }
    473     return g->current;
    474 }
    475 
    476 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
    477     uint32_t strength = tok->strength;
    478     uint32_t low = lows[fStrength*3+strength];
    479     uint32_t high = highs[fStrength*3+strength];
    480     uint32_t maxByte = 0;
    481     if(strength == UCOL_TERTIARY) {
    482         maxByte = 0x3F;
    483     } else if(strength == UCOL_PRIMARY) {
    484         maxByte = 0xFE;
    485     } else {
    486         maxByte = 0xFF;
    487     }
    488 
    489     uint32_t count = tok->toInsert;
    490 
    491     if(low >= high && strength > UCOL_PRIMARY) {
    492         int32_t s = strength;
    493         for(;;) {
    494             s--;
    495             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
    496                 if(strength == UCOL_SECONDARY) {
    497                     if (low < UCOL_COMMON_TOP2<<24 ) {
    498                        // Override if low range is less than UCOL_COMMON_TOP2.
    499 		        low = UCOL_COMMON_TOP2<<24;
    500                     }
    501                     high = 0xFFFFFFFF;
    502                 } else {
    503                     // Override if low range is less than UCOL_COMMON_BOT3.
    504 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
    505                         low = UCOL_COMMON_BOT3<<24;
    506 		    }
    507                     high = 0x40000000;
    508                 }
    509                 break;
    510             }
    511             if(s<0) {
    512                 *status = U_INTERNAL_PROGRAM_ERROR;
    513                 return 0;
    514             }
    515         }
    516     }
    517 
    518     if(low < 0x02000000) {
    519         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
    520         // See http://site.icu-project.org/design/collation/bytes
    521         low = 0x02000000;
    522     }
    523 
    524     if(strength == UCOL_SECONDARY) { /* similar as simple */
    525         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    526             low = UCOL_COMMON_TOP2<<24;
    527         }
    528         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    529             high = UCOL_COMMON_TOP2<<24;
    530         }
    531         if(low < (UCOL_COMMON_BOT2<<24)) {
    532             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
    533             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    534             //g->current = UCOL_COMMON_BOT2<<24;
    535             return g->current;
    536         }
    537     }
    538 
    539     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    540     if(g->noOfRanges == 0) {
    541         *status = U_INTERNAL_PROGRAM_ERROR;
    542     }
    543     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    544     return g->current;
    545 }
    546 
    547 static
    548 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    549     uint32_t i = 0;
    550     UChar c;
    551 
    552     if(U_FAILURE(*status)) {
    553         return 0;
    554     }
    555 
    556     if(sourceLen > resLen) {
    557         *status = U_MEMORY_ALLOCATION_ERROR;
    558         return 0;
    559     }
    560 
    561     for(i = 0; i < sourceLen; i++) {
    562         c = source[i];
    563         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    564             switch(c - 0x3000) {
    565             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
    566             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
    567                 c++;
    568                 break;
    569             case 0xF5:
    570                 c = 0x30AB;
    571                 break;
    572             case 0xF6:
    573                 c = 0x30B1;
    574                 break;
    575             }
    576         }
    577         resBuf[i] = c;
    578     }
    579     return sourceLen;
    580 }
    581 
    582 static
    583 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    584     uint32_t i = 0;
    585     UChar c;
    586 
    587     if(U_FAILURE(*status)) {
    588         return 0;
    589     }
    590 
    591     if(sourceLen > resLen) {
    592         *status = U_MEMORY_ALLOCATION_ERROR;
    593         return 0;
    594     }
    595 
    596     for(i = 0; i < sourceLen; i++) {
    597         c = source[i];
    598         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    599             switch(c - 0x3000) {
    600             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
    601             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
    602                 c--;
    603                 break;
    604             case 0xAB:
    605                 c = 0x30F5;
    606                 break;
    607             case 0xB1:
    608                 c = 0x30F6;
    609                 break;
    610             }
    611         }
    612         resBuf[i] = c;
    613     }
    614     return sourceLen;
    615 }
    616 
    617 U_NAMESPACE_BEGIN
    618 
    619 static
    620 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
    621     uint32_t i = 0;
    622     UChar n[128];
    623     uint32_t nLen = 0;
    624     uint32_t uCount = 0, lCount = 0;
    625 
    626     collIterate s;
    627     uint32_t order = 0;
    628 
    629     if(U_FAILURE(*status)) {
    630         return UCOL_LOWER_CASE;
    631     }
    632 
    633     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
    634     if(U_SUCCESS(*status)) {
    635         for(i = 0; i < nLen; i++) {
    636             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
    637             order = ucol_getNextCE(UCA, &s, status);
    638             if(isContinuation(order)) {
    639                 *status = U_INTERNAL_PROGRAM_ERROR;
    640                 return UCOL_LOWER_CASE;
    641             }
    642             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
    643                 uCount++;
    644             } else {
    645                 if(u_islower(n[i])) {
    646                     lCount++;
    647                 } else if(U_SUCCESS(*status)) {
    648                     UChar sk[1], lk[1];
    649                     u_toSmallKana(&n[i], 1, sk, 1, status);
    650                     u_toLargeKana(&n[i], 1, lk, 1, status);
    651                     if(sk[0] == n[i] && lk[0] != n[i]) {
    652                         lCount++;
    653                     }
    654                 }
    655             }
    656         }
    657     }
    658 
    659     if(uCount != 0 && lCount != 0) {
    660         return UCOL_MIXED_CASE;
    661     } else if(uCount != 0) {
    662         return UCOL_UPPER_CASE;
    663     } else {
    664         return UCOL_LOWER_CASE;
    665     }
    666 }
    667 
    668 
    669 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    670     /* this one makes the table and stuff */
    671     uint32_t noOfBytes[3];
    672     uint32_t i;
    673 
    674     for(i = 0; i<3; i++) {
    675         ucol_countBytes(CEparts[i], noOfBytes[i]);
    676     }
    677 
    678     /* Here we have to pack CEs from parts */
    679 
    680     uint32_t CEi = 0;
    681     uint32_t value = 0;
    682 
    683     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
    684         if(CEi > 0) {
    685             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    686         } else {
    687             value = 0;
    688         }
    689 
    690         if(2*CEi<noOfBytes[0]) {
    691             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
    692         }
    693         if(CEi<noOfBytes[1]) {
    694             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
    695         }
    696         if(CEi<noOfBytes[2]) {
    697             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
    698         }
    699         tok->CEs[CEi] = value;
    700         CEi++;
    701     }
    702     if(CEi == 0) { /* totally ignorable */
    703         tok->noOfCEs = 1;
    704         tok->CEs[0] = 0;
    705     } else { /* there is at least something */
    706         tok->noOfCEs = CEi;
    707     }
    708 
    709 
    710     // we want to set case bits here and now, not later.
    711     // Case bits handling
    712     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
    713         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    714         int32_t cSize = (tok->source & 0xFF000000) >> 24;
    715         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
    716 
    717         if(cSize > 1) {
    718             // Do it manually
    719             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
    720         } else {
    721             // Copy it from the UCA
    722             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
    723             tok->CEs[0] |= (caseCE & 0xC0);
    724         }
    725     }
    726 
    727 #if UCOL_DEBUG==2
    728     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    729     for(i = 0; i<tok->noOfCEs; i++) {
    730         fprintf(stderr, "%08X ", tok->CEs[i]);
    731     }
    732     fprintf(stderr, "\n");
    733 #endif
    734 }
    735 
    736 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    737     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    738     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
    739 
    740     UColToken *tok = lh->last;
    741     uint32_t t[UCOL_STRENGTH_LIMIT];
    742 
    743     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
    744 
    745     tok->toInsert = 1;
    746     t[tok->strength] = 1;
    747 
    748     while(tok->previous != NULL) {
    749         if(tok->previous->strength < tok->strength) { /* going up */
    750             t[tok->strength] = 0;
    751             t[tok->previous->strength]++;
    752         } else if(tok->previous->strength > tok->strength) { /* going down */
    753             t[tok->previous->strength] = 1;
    754         } else {
    755             t[tok->strength]++;
    756         }
    757         tok=tok->previous;
    758         tok->toInsert = t[tok->strength];
    759     }
    760 
    761     tok->toInsert = t[tok->strength];
    762     ucol_inv_getGapPositions(src, lh, status);
    763 
    764 #if UCOL_DEBUG
    765     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    766     int32_t j = 2;
    767     for(j = 2; j >= 0; j--) {
    768         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    769         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    770     }
    771     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    772 
    773     do {
    774         fprintf(stderr,"%i", tok->strength);
    775         tok = tok->next;
    776     } while(tok != NULL);
    777     fprintf(stderr, "\n");
    778 
    779     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    780 
    781     do {
    782         fprintf(stderr,"%i", tok->toInsert);
    783         tok = tok->next;
    784     } while(tok != NULL);
    785 #endif
    786 
    787     tok = lh->first;
    788     uint32_t fStrength = UCOL_IDENTICAL;
    789     uint32_t initStrength = UCOL_IDENTICAL;
    790 
    791 
    792     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    793     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    794     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
    795 
    796     while (tok != NULL && U_SUCCESS(*status)) {
    797         fStrength = tok->strength;
    798         if(fStrength < initStrength) {
    799             initStrength = fStrength;
    800             if(lh->pos[fStrength] == -1) {
    801                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
    802                     fStrength--;
    803                 }
    804                 if(lh->pos[fStrength] == -1) {
    805                     *status = U_INTERNAL_PROGRAM_ERROR;
    806                     return;
    807                 }
    808             }
    809             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
    810                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    811                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
    812                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
    813                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
    814             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
    815                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    816                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
    817                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    818                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    819             } else { /* primaries */
    820                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
    821                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    822                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    824             }
    825         } else {
    826             if(tok->strength == UCOL_TERTIARY) {
    827                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
    828             } else if(tok->strength == UCOL_SECONDARY) {
    829                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
    830                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    831             } else if(tok->strength == UCOL_PRIMARY) {
    832                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
    833                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    834                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    835             }
    836         }
    837         ucol_doCE(src, CEparts, tok, status);
    838         tok = tok->next;
    839     }
    840 }
    841 
    842 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    843     UCAElements el;
    844     UColToken *tok = lh->first;
    845     UColToken *expt = NULL;
    846     uint32_t i = 0, j = 0;
    847     UChar32 fcdHighStart;
    848     const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    849 
    850     while(tok != NULL && U_SUCCESS(*status)) {
    851         /* first, check if there are any expansions */
    852         /* if there are expansions, we need to do a little bit more processing */
    853         /* since parts of expansion can be tailored, while others are not */
    854         if(tok->expansion != 0) {
    855             uint32_t len = tok->expansion >> 24;
    856             uint32_t currentSequenceLen = len;
    857             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
    858             //uint32_t exp = currentSequenceLen | expOffset;
    859             UColToken exp;
    860             exp.source = currentSequenceLen | expOffset;
    861             exp.rulesToParseHdl = &(src->source);
    862 
    863             while(len > 0) {
    864                 currentSequenceLen = len;
    865                 while(currentSequenceLen > 0) {
    866                     exp.source = (currentSequenceLen << 24) | expOffset;
    867                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
    868                         uint32_t noOfCEsToCopy = expt->noOfCEs;
    869                         for(j = 0; j<noOfCEsToCopy; j++) {
    870                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
    871                         }
    872                         tok->noOfExpCEs += noOfCEsToCopy;
    873                         // Smart people never try to add codepoints and CEs.
    874                         // For some odd reason, it won't work.
    875                         expOffset += currentSequenceLen; //noOfCEsToCopy;
    876                         len -= currentSequenceLen; //noOfCEsToCopy;
    877                         break;
    878                     } else {
    879                         currentSequenceLen--;
    880                     }
    881                 }
    882                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
    883                     /* will have to get one from UCA */
    884                     /* first, get the UChars from the rules */
    885                     /* then pick CEs out until there is no more and stuff them into expansion */
    886                     collIterate s;
    887                     uint32_t order = 0;
    888                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
    889 
    890                     for(;;) {
    891                         order = ucol_getNextCE(src->UCA, &s, status);
    892                         if(order == UCOL_NO_MORE_CES) {
    893                             break;
    894                         }
    895                         tok->expCEs[tok->noOfExpCEs++] = order;
    896                     }
    897                     expOffset++;
    898                     len--;
    899                 }
    900             }
    901         } else {
    902             tok->noOfExpCEs = 0;
    903         }
    904 
    905         /* set the ucaelement with obtained values */
    906         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
    907         /* copy CEs */
    908         for(i = 0; i<tok->noOfCEs; i++) {
    909             el.CEs[i] = tok->CEs[i];
    910         }
    911         for(i = 0; i<tok->noOfExpCEs; i++) {
    912             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
    913         }
    914 
    915         /* copy UChars */
    916         // We kept prefix and source kind of together, as it is a kind of a contraction.
    917         // However, now we have to slice the prefix off the main thing -
    918         el.prefix = el.prefixChars;
    919         el.cPoints = el.uchars;
    920         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
    921             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
    922             // decomposed elements to the unsaf table.
    923             el.prefixSize = tok->prefix>>24;
    924             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
    925 
    926             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
    927             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
    928         } else {
    929             el.prefixSize = 0;
    930             *el.prefix = 0;
    931 
    932             el.cSize = (tok->source >> 24);
    933             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    934         }
    935         if(src->UCA != NULL) {
    936             for(i = 0; i<el.cSize; i++) {
    937                 if(UCOL_ISJAMO(el.cPoints[i])) {
    938                     t->image->jamoSpecial = TRUE;
    939                 }
    940             }
    941             if (!src->buildCCTabFlag && el.cSize > 0) {
    942                 // Check the trailing canonical combining class (tccc) of the last character.
    943                 const UChar *s = el.cPoints + el.cSize;
    944                 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
    945                 if ((fcd & 0xff) != 0) {
    946                     src->buildCCTabFlag = TRUE;
    947                 }
    948             }
    949         }
    950 
    951         /* and then, add it */
    952 #if UCOL_DEBUG==2
    953         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
    954 #endif
    955         uprv_uca_addAnElement(t, &el, status);
    956 
    957 #if UCOL_DEBUG_DUPLICATES
    958         if(*status != U_ZERO_ERROR) {
    959             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
    960             *status = U_ZERO_ERROR;
    961         }
    962 #endif
    963 
    964         tok = tok->next;
    965     }
    966 }
    967 
    968 U_CDECL_BEGIN
    969 static UBool U_CALLCONV
    970 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    971     UErrorCode status = U_ZERO_ERROR;
    972     tempUCATable *t = (tempUCATable *)context;
    973     if(value == 0) {
    974         while(start < limit) {
    975             uint32_t CE = utrie_get32(t->mapping, start, NULL);
    976             if(CE == UCOL_NOT_FOUND) {
    977                 UCAElements el;
    978                 el.isThai = FALSE;
    979                 el.prefixSize = 0;
    980                 el.prefixChars[0] = 0;
    981                 el.prefix = el.prefixChars;
    982                 el.cPoints = el.uchars;
    983 
    984                 el.cSize = 0;
    985                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
    986 
    987                 el.noOfCEs = 1;
    988                 el.CEs[0] = 0;
    989                 uprv_uca_addAnElement(t, &el, &status);
    990 
    991             }
    992             start++;
    993         }
    994     }
    995     if(U_FAILURE(status)) {
    996         return FALSE;
    997     } else {
    998         return TRUE;
    999     }
   1000 }
   1001 U_CDECL_END
   1002 
   1003 static void
   1004 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
   1005                                UChar32 start, UChar32 end,
   1006                                UErrorCode *status)
   1007 {
   1008     //UChar decomp[256];
   1009     uint32_t CE = UCOL_NOT_FOUND;
   1010     UChar32 u = 0;
   1011     UCAElements el;
   1012     el.isThai = FALSE;
   1013     el.prefixSize = 0;
   1014     el.prefixChars[0] = 0;
   1015     collIterate colIt;
   1016 
   1017     if(U_SUCCESS(*status)) {
   1018         for(u = start; u<=end; u++) {
   1019             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
   1020                 /* this test is for contractions that are missing the starting element. */
   1021                 || ((isCntTableElement(CE)) &&
   1022                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
   1023                 )
   1024             {
   1025                 el.cSize = 0;
   1026                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
   1027                 //decomp[0] = (UChar)u;
   1028                 //el.uchars[0] = (UChar)u;
   1029                 el.cPoints = el.uchars;
   1030                 //el.cSize = 1;
   1031                 el.noOfCEs = 0;
   1032                 el.prefix = el.prefixChars;
   1033                 el.prefixSize = 0;
   1034                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
   1035                 // We actually want to check whether this element is a special
   1036                 // If it is an implicit element (hangul, CJK - we want to copy the
   1037                 // special, not the resolved CEs) - for hangul, copying resolved
   1038                 // would just make things the same (there is an expansion and it
   1039                 // takes approximately the same amount of time to resolve as
   1040                 // falling back to the UCA).
   1041                 /*
   1042                 UTRIE_GET32(src->UCA->mapping, u, CE);
   1043                 tag = getCETag(CE);
   1044                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
   1045                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
   1046                 || tag == LEAD_SURROGATE_TAG) {
   1047                 el.CEs[el.noOfCEs++] = CE;
   1048                 } else {
   1049                 */
   1050                 // It turns out that it does not make sense to keep implicits
   1051                 // unresolved. The cost of resolving them is big enough so that
   1052                 // it doesn't make any difference whether we have to go to the UCA
   1053                 // or not.
   1054                 {
   1055                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
   1056                     while(CE != UCOL_NO_MORE_CES) {
   1057                         CE = ucol_getNextCE(src->UCA, &colIt, status);
   1058                         if(CE != UCOL_NO_MORE_CES) {
   1059                             el.CEs[el.noOfCEs++] = CE;
   1060                         }
   1061                     }
   1062                 }
   1063                 uprv_uca_addAnElement(t, &el, status);
   1064             }
   1065         }
   1066     }
   1067 }
   1068 
   1069 U_NAMESPACE_END
   1070 
   1071 U_CFUNC UCATableHeader *
   1072 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
   1073     U_NAMESPACE_USE
   1074 
   1075     uint32_t i = 0;
   1076     if(U_FAILURE(*status)) {
   1077         return NULL;
   1078     }
   1079     /*
   1080     2.  Eliminate the negative lists by doing the following for each non-null negative list:
   1081     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
   1082     create new ListHeader X
   1083     o   reverse the list, add to the end of X's positive list. Reset the strength of the
   1084     first item you add, based on the stronger strength levels of the two lists.
   1085     */
   1086     /*
   1087     3.  For each ListHeader with a non-null positive list:
   1088     */
   1089     /*
   1090     o   Find all character strings with CEs between the baseCE and the
   1091     next/previous CE, at the strength of the first token. Add these to the
   1092     tailoring.
   1093     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
   1094     tailoring has & x < z...
   1095     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
   1096     */
   1097     /* It is possible that this part should be done even while constructing list */
   1098     /* The problem is that it is unknown what is going to be the strongest weight */
   1099     /* So we might as well do it here */
   1100 
   1101     /*
   1102     o   Allocate CEs for each token in the list, based on the total number N of the
   1103     largest level difference, and the gap G between baseCE and nextCE at that
   1104     level. The relation * between the last item and nextCE is the same as the
   1105     strongest strength.
   1106     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
   1107     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
   1108     Then fit b and c into the secondary gap between a and d, then fit q
   1109     into the tertiary gap between b and c.
   1110 
   1111     o   Example: baseCE << b <<< q << c * nextCE(X,2)
   1112     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
   1113     Then fit q into the tertiary gap between b and c.
   1114     o   When incrementing primary values, we will not cross high byte
   1115     boundaries except where there is only a single-byte primary. That is to
   1116     ensure that the script reordering will continue to work.
   1117     */
   1118     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
   1119     /* test for NULL */
   1120     if (image == NULL) {
   1121         *status = U_MEMORY_ALLOCATION_ERROR;
   1122         return NULL;
   1123     }
   1124     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
   1125 
   1126     for(i = 0; i<src->resultLen; i++) {
   1127         /* now we need to generate the CEs */
   1128         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1129         /* According to strength                                                          */
   1130         if(U_SUCCESS(*status)) {
   1131             if(src->lh[i].first) { // if there are any elements
   1132                 // due to the way parser works, subsequent tailorings
   1133                 // may remove all the elements from a sequence, therefore
   1134                 // leaving an empty tailoring sequence.
   1135                 ucol_initBuffers(src, &src->lh[i], status);
   1136             }
   1137         }
   1138         if(U_FAILURE(*status)) {
   1139             uprv_free(image);
   1140             return NULL;
   1141         }
   1142     }
   1143 
   1144     if(src->varTop != NULL) { /* stuff the variable top value */
   1145         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
   1146         /* remove it from the list */
   1147         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
   1148             src->varTop->listHeader->first = src->varTop->next;
   1149         }
   1150         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
   1151             src->varTop->listHeader->last = src->varTop->previous;
   1152         }
   1153         if(src->varTop->next != NULL) {
   1154             src->varTop->next->previous = src->varTop->previous;
   1155         }
   1156         if(src->varTop->previous != NULL) {
   1157             src->varTop->previous->next = src->varTop->next;
   1158         }
   1159     }
   1160 
   1161 
   1162     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
   1163     if(U_FAILURE(*status)) {
   1164         uprv_free(image);
   1165         return NULL;
   1166     }
   1167 
   1168 
   1169     /* After this, we have assigned CE values to all regular CEs      */
   1170     /* now we will go through list once more and resolve expansions,  */
   1171     /* make UCAElements structs and add them to table                 */
   1172     for(i = 0; i<src->resultLen; i++) {
   1173         /* now we need to generate the CEs */
   1174         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1175         /* According to strength                                                          */
   1176         if(U_SUCCESS(*status)) {
   1177             ucol_createElements(src, t, &src->lh[i], status);
   1178         }
   1179     }
   1180 
   1181     UCAElements el;
   1182     el.isThai = FALSE;
   1183     el.prefixSize = 0;
   1184     el.prefixChars[0] = 0;
   1185 
   1186     /* add latin-1 stuff */
   1187     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
   1188 
   1189     /* add stuff for copying */
   1190     if(src->copySet != NULL) {
   1191         int32_t i = 0;
   1192         UnicodeSet *set = (UnicodeSet *)src->copySet;
   1193         for(i = 0; i < set->getRangeCount(); i++) {
   1194             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
   1195         }
   1196     }
   1197 
   1198     if(U_SUCCESS(*status)) {
   1199         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
   1200 
   1201         uint32_t tailoredCE = UCOL_NOT_FOUND;
   1202         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
   1203         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
   1204         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
   1205         // Check for null pointer
   1206         if (ucaEl == NULL) {
   1207         	*status = U_MEMORY_ALLOCATION_ERROR;
   1208         	return NULL;
   1209         }
   1210         while(*conts != 0) {
   1211             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
   1212             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
   1213             if(tailoredCE != UCOL_NOT_FOUND) {
   1214                 UBool needToAdd = TRUE;
   1215                 if(isCntTableElement(tailoredCE)) {
   1216                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
   1217                         needToAdd = FALSE;
   1218                     }
   1219                 }
   1220                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
   1221                     UCAElements elm;
   1222                     elm.cPoints = el.uchars;
   1223                     elm.noOfCEs = 0;
   1224                     elm.uchars[0] = *conts;
   1225                     elm.uchars[1] = 0;
   1226                     elm.cSize = 1;
   1227                     elm.prefixChars[0] = *(conts+2);
   1228                     elm.isThai = FALSE;
   1229                     elm.prefix = elm.prefixChars;
   1230                     elm.prefixSize = 1;
   1231                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
   1232                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
   1233                         needToAdd = TRUE;
   1234                     }
   1235                 }
   1236                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1237                     needToAdd = FALSE;
   1238                 }
   1239 
   1240                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
   1241                     if (*(conts+1) != 0) {  // contractions
   1242                         el.prefix = el.prefixChars;
   1243                         el.prefixSize = 0;
   1244                         el.cPoints = el.uchars;
   1245                         el.noOfCEs = 0;
   1246                         el.uchars[0] = *conts;
   1247                         el.uchars[1] = *(conts+1);
   1248                         if(*(conts+2)!=0) {
   1249                             el.uchars[2] = *(conts+2);
   1250                             el.cSize = 3;
   1251                         } else {
   1252                             el.cSize = 2;
   1253                         }
   1254                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
   1255                     }
   1256                     else { // pre-context character
   1257                         UChar str[4] = { 0 };
   1258                         int32_t len=0;
   1259                         int32_t preKeyLen=0;
   1260 
   1261                         el.cPoints = el.uchars;
   1262                         el.noOfCEs = 0;
   1263                         el.uchars[0] = *conts;
   1264                         el.uchars[1] = 0;
   1265                         el.cSize = 1;
   1266                         el.prefixChars[0] = *(conts+2);
   1267                         el.prefix = el.prefixChars;
   1268                         el.prefixSize = 1;
   1269                         if (el.prefixChars[0]!=0) {
   1270                             // get CE of prefix character first
   1271                             str[0]=el.prefixChars[0];
   1272                             str[1]=0;
   1273                             ucol_setText(ucaEl, str, 1, status);
   1274                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
   1275                                     != UCOL_NULLORDER) {
   1276                                 preKeyLen++;  // count number of keys for prefix character
   1277                             }
   1278                             str[len++] = el.prefixChars[0];
   1279                         }
   1280 
   1281                         str[len++] = el.uchars[0];
   1282                         str[len]=0;
   1283                         ucol_setText(ucaEl, str, len, status);
   1284                         // Skip the keys for prefix character, then copy the rest to el.
   1285                         while ((preKeyLen-->0) &&
   1286                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1287                             continue;
   1288                         }
   1289 
   1290                     }
   1291                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1292                         el.noOfCEs++;
   1293                     }
   1294                     uprv_uca_addAnElement(t, &el, status);
   1295                 }
   1296 
   1297             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1298                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
   1299             }
   1300             conts+=3;
   1301         }
   1302         ucol_closeElements(ucaEl);
   1303     }
   1304 
   1305     // Add completely ignorable elements
   1306     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
   1307 
   1308     // add tailoring characters related canonical closures
   1309     uprv_uca_canonicalClosure(t, src, NULL, status);
   1310 
   1311     /* still need to produce compatibility closure */
   1312 
   1313     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1314 
   1315     uprv_uca_closeTempTable(t);
   1316     uprv_free(image);
   1317 
   1318     return myData;
   1319 }
   1320 
   1321 U_CDECL_BEGIN
   1322 static UBool U_CALLCONV
   1323 ucol_bld_cleanup(void)
   1324 {
   1325     udata_close(invUCA_DATA_MEM);
   1326     invUCA_DATA_MEM = NULL;
   1327     _staticInvUCA = NULL;
   1328     return TRUE;
   1329 }
   1330 U_CDECL_END
   1331 
   1332 U_CAPI const InverseUCATableHeader * U_EXPORT2
   1333 ucol_initInverseUCA(UErrorCode *status)
   1334 {
   1335     if(U_FAILURE(*status)) return NULL;
   1336 
   1337     UBool needsInit;
   1338     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
   1339 
   1340     if(needsInit) {
   1341         InverseUCATableHeader *newInvUCA = NULL;
   1342         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
   1343 
   1344         if(U_FAILURE(*status)) {
   1345             if (result) {
   1346                 udata_close(result);
   1347             }
   1348             // This is not needed, as we are talking about
   1349             // memory we got from UData
   1350             //uprv_free(newInvUCA);
   1351         }
   1352 
   1353         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
   1354             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
   1355             UCollator *UCA = ucol_initUCA(status);
   1356             // UCA versions of UCA and inverse UCA should match
   1357             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
   1358                 *status = U_INVALID_FORMAT_ERROR;
   1359                 udata_close(result);
   1360                 return NULL;
   1361             }
   1362 
   1363             umtx_lock(NULL);
   1364             if(_staticInvUCA == NULL) {
   1365                 invUCA_DATA_MEM = result;
   1366                 _staticInvUCA = newInvUCA;
   1367                 result = NULL;
   1368                 newInvUCA = NULL;
   1369             }
   1370             umtx_unlock(NULL);
   1371 
   1372             if(newInvUCA != NULL) {
   1373                 udata_close(result);
   1374                 // This is not needed, as we are talking about
   1375                 // memory we got from UData
   1376                 //uprv_free(newInvUCA);
   1377             }
   1378             else {
   1379                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
   1380             }
   1381         }
   1382     }
   1383     return _staticInvUCA;
   1384 }
   1385 
   1386 /* This is the data that is used for non-script reordering codes. These _must_ be kept
   1387  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
   1388  */
   1389 static const char* ReorderingTokenNames[] = {
   1390     "SPACE",
   1391     "PUNCT",
   1392     "SYMBOL",
   1393     "CURRENCY",
   1394     "DIGIT",
   1395     NULL
   1396 };
   1397 
   1398 static void toUpper(const char* src, char* dst, uint32_t length) {
   1399    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
   1400        *dst = toupper(*src);
   1401    }
   1402    *dst = '\0';
   1403 }
   1404 
   1405 U_INTERNAL int32_t U_EXPORT2
   1406 ucol_findReorderingEntry(const char* name) {
   1407     char buffer[32];
   1408     toUpper(name, buffer, 32);
   1409     for (uint32_t entry = 0; ReorderingTokenNames[entry] != NULL; entry++) {
   1410         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
   1411             return entry + UCOL_REORDER_CODE_FIRST;
   1412         }
   1413     }
   1414     return USCRIPT_INVALID_CODE;
   1415 }
   1416 
   1417 #endif /* #if !UCONFIG_NO_COLLATION */
   1418