Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2013, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_bld.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module builds a collator based on the rule set.
     17 *
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/ucoleitr.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 #include "unicode/uscript.h"
     29 #include "unicode/ustring.h"
     30 #include "unicode/utf16.h"
     31 #include "normalizer2impl.h"
     32 #include "uassert.h"
     33 #include "ucol_bld.h"
     34 #include "ucol_elm.h"
     35 #include "ucol_cnt.h"
     36 #include "ucln_in.h"
     37 #include "umutex.h"
     38 #include "cmemory.h"
     39 #include "cstring.h"
     40 
     41 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     42 
     43 static const InverseUCATableHeader* _staticInvUCA = NULL;
     44 static UDataMemory* invUCA_DATA_MEM = NULL;
     45 static icu::UInitOnce gStaticInvUCAInitOnce = U_INITONCE_INITIALIZER;
     46 
     47 U_CDECL_BEGIN
     48 static UBool U_CALLCONV
     49 isAcceptableInvUCA(void * /*context*/,
     50                    const char * /*type*/, const char * /*name*/,
     51                    const UDataInfo *pInfo)
     52 {
     53     /* context, type & name are intentionally not used */
     54     if( pInfo->size>=20 &&
     55         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     56         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     57         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
     58         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
     59         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
     60         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
     61         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
     62         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
     63         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
     64         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
     65         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
     66         )
     67     {
     68         // TODO: Check that the invuca data version (pInfo->dataVersion)
     69         // matches the ucadata version.
     70         return TRUE;
     71     } else {
     72         return FALSE;
     73     }
     74 }
     75 U_CDECL_END
     76 
     77 /*
     78 * Takes two CEs (lead and continuation) and
     79 * compares them as CEs should be compared:
     80 * primary vs. primary, secondary vs. secondary
     81 * tertiary vs. tertiary
     82 */
     83 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
     84     uint32_t s1 = source0, s2, t1 = target0, t2;
     85     if(isContinuation(source1)) {
     86         s2 = source1;
     87     } else {
     88         s2 = 0;
     89     }
     90     if(isContinuation(target1)) {
     91         t2 = target1;
     92     } else {
     93         t2 = 0;
     94     }
     95 
     96     uint32_t s = 0, t = 0;
     97     if(s1 == t1 && s2 == t2) {
     98         return 0;
     99     }
    100     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
    101     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
    102     if(s < t) {
    103         return -1;
    104     } else if(s > t) {
    105         return 1;
    106     } else {
    107         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    108         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    109         if(s < t) {
    110             return -1;
    111         } else if(s > t) {
    112             return 1;
    113         } else {
    114             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    115             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    116             if(s < t) {
    117                 return -1;
    118             } else {
    119                 return 1;
    120             }
    121         }
    122     }
    123 }
    124 
    125 static
    126 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
    127     uint32_t bottom = 0, top = src->invUCA->tableSize;
    128     uint32_t i = 0;
    129     uint32_t first = 0, second = 0;
    130     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    131     int32_t res = 0;
    132 
    133     while(bottom < top-1) {
    134         i = (top+bottom)/2;
    135         first = *(CETable+3*i);
    136         second = *(CETable+3*i+1);
    137         res = compareCEs(first, second, CE, SecondCE);
    138         if(res > 0) {
    139             top = i;
    140         } else if(res < 0) {
    141             bottom = i;
    142         } else {
    143             break;
    144         }
    145     }
    146 
    147     /* weiv:                                                  */
    148     /* in searching for elements, I have removed the failure  */
    149     /* The reason for this is that the builder does not rely  */
    150     /* on search mechanism telling it that it didn't find an  */
    151     /* element. However, indirect positioning relies on being */
    152     /* able to find the elements around any CE, even if it is */
    153     /* not defined in the UCA. */
    154     return i;
    155     /*
    156     if((first == CE && second == SecondCE)) {
    157     return i;
    158     } else {
    159     return -1;
    160     }
    161     */
    162 }
    163 
    164 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    165     0xFFFF0000,
    166     0xFFFFFF00,
    167     0xFFFFFFFF
    168 };
    169 
    170 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
    171                                             uint32_t CE, uint32_t contCE,
    172                                             uint32_t *nextCE, uint32_t *nextContCE,
    173                                             uint32_t strength)
    174 {
    175     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    176     int32_t iCE;
    177 
    178     iCE = ucol_inv_findCE(src, CE, contCE);
    179 
    180     if(iCE<0) {
    181         *nextCE = UCOL_NOT_FOUND;
    182         return -1;
    183     }
    184 
    185     CE &= strengthMask[strength];
    186     contCE &= strengthMask[strength];
    187 
    188     *nextCE = CE;
    189     *nextContCE = contCE;
    190 
    191     while((*nextCE  & strengthMask[strength]) == CE
    192         && (*nextContCE  & strengthMask[strength]) == contCE)
    193     {
    194         *nextCE = (*(CETable+3*(++iCE)));
    195         *nextContCE = (*(CETable+3*(iCE)+1));
    196     }
    197 
    198     return iCE;
    199 }
    200 
    201 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
    202                                             uint32_t CE, uint32_t contCE,
    203                                             uint32_t *prevCE, uint32_t *prevContCE,
    204                                             uint32_t strength)
    205 {
    206     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    207     int32_t iCE;
    208 
    209     iCE = ucol_inv_findCE(src, CE, contCE);
    210 
    211     if(iCE<0) {
    212         *prevCE = UCOL_NOT_FOUND;
    213         return -1;
    214     }
    215 
    216     CE &= strengthMask[strength];
    217     contCE &= strengthMask[strength];
    218 
    219     *prevCE = CE;
    220     *prevContCE = contCE;
    221 
    222     while((*prevCE  & strengthMask[strength]) == CE
    223         && (*prevContCE  & strengthMask[strength])== contCE
    224         && iCE > 0) /* this condition should prevent falling off the edge of the world */
    225     {
    226         /* here, we end up in a singularity - zero */
    227         *prevCE = (*(CETable+3*(--iCE)));
    228         *prevContCE = (*(CETable+3*(iCE)+1));
    229     }
    230 
    231     return iCE;
    232 }
    233 
    234 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
    235                                                        uint32_t prevCE, uint32_t prevContCE)
    236 {
    237     if(prevCE == CE && prevContCE == contCE) {
    238         return UCOL_IDENTICAL;
    239     }
    240     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
    241         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
    242     {
    243         return UCOL_PRIMARY;
    244     }
    245     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
    246         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
    247     {
    248         return UCOL_SECONDARY;
    249     }
    250     return UCOL_TERTIARY;
    251 }
    252 
    253 
    254 /*static
    255 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    256 
    257     uint32_t CE = lh->baseCE;
    258     uint32_t SecondCE = lh->baseContCE;
    259 
    260     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    261     uint32_t previousCE, previousContCE;
    262     int32_t iCE;
    263 
    264     iCE = ucol_inv_findCE(src, CE, SecondCE);
    265 
    266     if(iCE<0) {
    267         return -1;
    268     }
    269 
    270     CE &= strengthMask[strength];
    271     SecondCE &= strengthMask[strength];
    272 
    273     previousCE = CE;
    274     previousContCE = SecondCE;
    275 
    276     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
    277         previousCE = (*(CETable+3*(--iCE)));
    278         previousContCE = (*(CETable+3*(iCE)+1));
    279     }
    280     lh->previousCE = previousCE;
    281     lh->previousContCE = previousContCE;
    282 
    283     return iCE;
    284 }*/
    285 
    286 static
    287 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    288     uint32_t CE = lh->baseCE;
    289     uint32_t SecondCE = lh->baseContCE;
    290 
    291     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    292     uint32_t nextCE, nextContCE;
    293     int32_t iCE;
    294 
    295     iCE = ucol_inv_findCE(src, CE, SecondCE);
    296 
    297     if(iCE<0) {
    298         return -1;
    299     }
    300 
    301     CE &= strengthMask[strength];
    302     SecondCE &= strengthMask[strength];
    303 
    304     nextCE = CE;
    305     nextContCE = SecondCE;
    306 
    307     while((nextCE  & strengthMask[strength]) == CE
    308         && (nextContCE  & strengthMask[strength]) == SecondCE)
    309     {
    310         nextCE = (*(CETable+3*(++iCE)));
    311         nextContCE = (*(CETable+3*(iCE)+1));
    312     }
    313 
    314     lh->nextCE = nextCE;
    315     lh->nextContCE = nextContCE;
    316 
    317     return iCE;
    318 }
    319 
    320 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    321     /* reset all the gaps */
    322     int32_t i = 0;
    323     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    324     uint32_t st = 0;
    325     uint32_t t1, t2;
    326     int32_t pos;
    327 
    328     UColToken *tok = lh->first;
    329     uint32_t tokStrength = tok->strength;
    330 
    331     for(i = 0; i<3; i++) {
    332         lh->gapsHi[3*i] = 0;
    333         lh->gapsHi[3*i+1] = 0;
    334         lh->gapsHi[3*i+2] = 0;
    335         lh->gapsLo[3*i] = 0;
    336         lh->gapsLo[3*i+1] = 0;
    337         lh->gapsLo[3*i+2] = 0;
    338         lh->numStr[i] = 0;
    339         lh->fStrToken[i] = NULL;
    340         lh->lStrToken[i] = NULL;
    341         lh->pos[i] = -1;
    342     }
    343 
    344     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    345 
    346     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
    347         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    348         lh->pos[0] = 0;
    349         t1 = lh->baseCE;
    350         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
    351         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    352         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    353         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    354         uint32_t primaryCE = (t1 & UCOL_PRIMARYMASK) | ((t2 & UCOL_PRIMARYMASK) >> 16);
    355         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
    356 
    357         t1 = (primaryCE & UCOL_PRIMARYMASK) | 0x0505;
    358         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
    359 
    360         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    361         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    362         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    363     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
    364         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    365         lh->pos[0] = 0;
    366         t1 = lh->baseCE;
    367         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
    368         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    369         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    370         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    371         t1 = lh->nextCE;
    372         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
    373         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    374         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    375         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    376     } else {
    377         for(;;) {
    378             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    379                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
    380                     lh->fStrToken[tokStrength] = tok;
    381                 } else { /* The CE must be implicit, since it's not in the table */
    382                     /* Error */
    383                     *status = U_INTERNAL_PROGRAM_ERROR;
    384                 }
    385             }
    386 
    387             while(tok != NULL && tok->strength >= tokStrength) {
    388                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    389                     lh->lStrToken[tokStrength] = tok;
    390                 }
    391                 tok = tok->next;
    392             }
    393             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
    394                 /* check if previous interval is the same and merge the intervals if it is so */
    395                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
    396                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
    397                     lh->fStrToken[tokStrength+1] = NULL;
    398                     lh->lStrToken[tokStrength+1] = NULL;
    399                     lh->pos[tokStrength+1] = -1;
    400                 }
    401             }
    402             if(tok != NULL) {
    403                 tokStrength = tok->strength;
    404             } else {
    405                 break;
    406             }
    407         }
    408         for(st = 0; st < 3; st++) {
    409             if((pos = lh->pos[st]) >= 0) {
    410                 t1 = *(CETable+3*(pos));
    411                 t2 = *(CETable+3*(pos)+1);
    412                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    413                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    414                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    415                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    416                 //pos--;
    417                 //t1 = *(CETable+3*(pos));
    418                 //t2 = *(CETable+3*(pos)+1);
    419                 t1 = lh->baseCE;
    420                 t2 = lh->baseContCE;
    421                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    422                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    423                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    424             }
    425         }
    426     }
    427 }
    428 
    429 
    430 #define ucol_countBytes(value, noOfBytes)   \
    431 {                               \
    432     uint32_t mask = 0xFFFFFFFF;   \
    433     (noOfBytes) = 0;              \
    434     while(mask != 0) {            \
    435     if(((value) & mask) != 0) { \
    436     (noOfBytes)++;            \
    437     }                           \
    438     mask >>= 8;                 \
    439     }                             \
    440 }
    441 
    442 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    443     if(U_SUCCESS(*status)) {
    444         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    445     }
    446     return g->current;
    447 }
    448 
    449 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    450     /* TODO: rename to enum names */
    451     uint32_t high, low, count=1;
    452     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
    453 
    454     if(strength == UCOL_SECONDARY) {
    455         low = UCOL_COMMON_TOP2<<24;
    456         high = 0xFFFFFFFF;
    457         count = 0xFF - UCOL_COMMON_TOP2;
    458     } else {
    459         low = UCOL_BYTE_COMMON << 24; //0x05000000;
    460         high = 0x40000000;
    461         count = 0x40 - UCOL_BYTE_COMMON;
    462     }
    463 
    464     if(tok->next != NULL && tok->next->strength == strength) {
    465         count = tok->next->toInsert;
    466     }
    467 
    468     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    469     g->current = UCOL_BYTE_COMMON<<24;
    470 
    471     if(g->noOfRanges == 0) {
    472         *status = U_INTERNAL_PROGRAM_ERROR;
    473     }
    474     return g->current;
    475 }
    476 
    477 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
    478     uint32_t strength = tok->strength;
    479     uint32_t low = lows[fStrength*3+strength];
    480     uint32_t high = highs[fStrength*3+strength];
    481     uint32_t maxByte = 0;
    482     if(strength == UCOL_TERTIARY) {
    483         maxByte = 0x3F;
    484     } else if(strength == UCOL_PRIMARY) {
    485         maxByte = 0xFE;
    486     } else {
    487         maxByte = 0xFF;
    488     }
    489 
    490     uint32_t count = tok->toInsert;
    491 
    492     if(low >= high && strength > UCOL_PRIMARY) {
    493         int32_t s = strength;
    494         for(;;) {
    495             s--;
    496             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
    497                 if(strength == UCOL_SECONDARY) {
    498                     if (low < UCOL_COMMON_TOP2<<24 ) {
    499                        // Override if low range is less than UCOL_COMMON_TOP2.
    500 		        low = UCOL_COMMON_TOP2<<24;
    501                     }
    502                     high = 0xFFFFFFFF;
    503                 } else {
    504                     // Override if low range is less than UCOL_COMMON_BOT3.
    505 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
    506                         low = UCOL_COMMON_BOT3<<24;
    507 		    }
    508                     high = 0x40000000;
    509                 }
    510                 break;
    511             }
    512             if(s<0) {
    513                 *status = U_INTERNAL_PROGRAM_ERROR;
    514                 return 0;
    515             }
    516         }
    517     }
    518 
    519     if(low < 0x02000000) {
    520         // We must not use CE weight byte 02, so we set it as the minimum lower bound.
    521         // See http://site.icu-project.org/design/collation/bytes
    522         low = 0x02000000;
    523     }
    524 
    525     if(strength == UCOL_SECONDARY) { /* similar as simple */
    526         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    527             low = UCOL_COMMON_TOP2<<24;
    528         }
    529         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    530             high = UCOL_COMMON_TOP2<<24;
    531         }
    532         if(low < (UCOL_COMMON_BOT2<<24)) {
    533             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
    534             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    535             //g->current = UCOL_COMMON_BOT2<<24;
    536             return g->current;
    537         }
    538     }
    539 
    540     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    541     if(g->noOfRanges == 0) {
    542         *status = U_INTERNAL_PROGRAM_ERROR;
    543     }
    544     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    545     return g->current;
    546 }
    547 
    548 static
    549 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    550     uint32_t i = 0;
    551     UChar c;
    552 
    553     if(U_FAILURE(*status)) {
    554         return 0;
    555     }
    556 
    557     if(sourceLen > resLen) {
    558         *status = U_MEMORY_ALLOCATION_ERROR;
    559         return 0;
    560     }
    561 
    562     for(i = 0; i < sourceLen; i++) {
    563         c = source[i];
    564         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    565             switch(c - 0x3000) {
    566             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
    567             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
    568                 c++;
    569                 break;
    570             case 0xF5:
    571                 c = 0x30AB;
    572                 break;
    573             case 0xF6:
    574                 c = 0x30B1;
    575                 break;
    576             }
    577         }
    578         resBuf[i] = c;
    579     }
    580     return sourceLen;
    581 }
    582 
    583 static
    584 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    585     uint32_t i = 0;
    586     UChar c;
    587 
    588     if(U_FAILURE(*status)) {
    589         return 0;
    590     }
    591 
    592     if(sourceLen > resLen) {
    593         *status = U_MEMORY_ALLOCATION_ERROR;
    594         return 0;
    595     }
    596 
    597     for(i = 0; i < sourceLen; i++) {
    598         c = source[i];
    599         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    600             switch(c - 0x3000) {
    601             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
    602             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
    603                 c--;
    604                 break;
    605             case 0xAB:
    606                 c = 0x30F5;
    607                 break;
    608             case 0xB1:
    609                 c = 0x30F6;
    610                 break;
    611             }
    612         }
    613         resBuf[i] = c;
    614     }
    615     return sourceLen;
    616 }
    617 
    618 U_NAMESPACE_BEGIN
    619 
    620 static
    621 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
    622     uint32_t i = 0;
    623     UChar n[128];
    624     uint32_t nLen = 0;
    625     uint32_t uCount = 0, lCount = 0;
    626 
    627     collIterate s;
    628     uint32_t order = 0;
    629 
    630     if(U_FAILURE(*status)) {
    631         return UCOL_LOWER_CASE;
    632     }
    633 
    634     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
    635     if(U_SUCCESS(*status)) {
    636         for(i = 0; i < nLen; i++) {
    637             uprv_init_collIterate(UCA, &n[i], 1, &s, status);
    638             order = ucol_getNextCE(UCA, &s, status);
    639             if(isContinuation(order)) {
    640                 *status = U_INTERNAL_PROGRAM_ERROR;
    641                 return UCOL_LOWER_CASE;
    642             }
    643             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
    644                 uCount++;
    645             } else {
    646                 if(u_islower(n[i])) {
    647                     lCount++;
    648                 } else if(U_SUCCESS(*status)) {
    649                     UChar sk[1], lk[1];
    650                     u_toSmallKana(&n[i], 1, sk, 1, status);
    651                     u_toLargeKana(&n[i], 1, lk, 1, status);
    652                     if(sk[0] == n[i] && lk[0] != n[i]) {
    653                         lCount++;
    654                     }
    655                 }
    656             }
    657         }
    658     }
    659 
    660     if(uCount != 0 && lCount != 0) {
    661         return UCOL_MIXED_CASE;
    662     } else if(uCount != 0) {
    663         return UCOL_UPPER_CASE;
    664     } else {
    665         return UCOL_LOWER_CASE;
    666     }
    667 }
    668 
    669 
    670 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    671     /* this one makes the table and stuff */
    672     uint32_t noOfBytes[3];
    673     uint32_t i;
    674 
    675     for(i = 0; i<3; i++) {
    676         ucol_countBytes(CEparts[i], noOfBytes[i]);
    677     }
    678 
    679     /* Here we have to pack CEs from parts */
    680 
    681     uint32_t CEi = 0;
    682     uint32_t value = 0;
    683 
    684     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
    685         if(CEi > 0) {
    686             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    687         } else {
    688             value = 0;
    689         }
    690 
    691         if(2*CEi<noOfBytes[0]) {
    692             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
    693         }
    694         if(CEi<noOfBytes[1]) {
    695             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
    696         }
    697         if(CEi<noOfBytes[2]) {
    698             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
    699         }
    700         tok->CEs[CEi] = value;
    701         CEi++;
    702     }
    703     if(CEi == 0) { /* totally ignorable */
    704         tok->noOfCEs = 1;
    705         tok->CEs[0] = 0;
    706     } else { /* there is at least something */
    707         tok->noOfCEs = CEi;
    708     }
    709 
    710 
    711     // we want to set case bits here and now, not later.
    712     // Case bits handling
    713     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
    714         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    715         int32_t cSize = (tok->source & 0xFF000000) >> 24;
    716         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
    717 
    718         if(cSize > 1) {
    719             // Do it manually
    720             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
    721         } else {
    722             // Copy it from the UCA
    723             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
    724             tok->CEs[0] |= (caseCE & 0xC0);
    725         }
    726     }
    727 
    728 #if UCOL_DEBUG==2
    729     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    730     for(i = 0; i<tok->noOfCEs; i++) {
    731         fprintf(stderr, "%08X ", tok->CEs[i]);
    732     }
    733     fprintf(stderr, "\n");
    734 #endif
    735 }
    736 
    737 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    738     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    739     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
    740 
    741     UColToken *tok = lh->last;
    742     uint32_t t[UCOL_STRENGTH_LIMIT];
    743 
    744     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
    745 
    746     /* must initialize ranges to avoid memory check warnings */
    747     for (int i = 0; i < UCOL_CE_STRENGTH_LIMIT; i++) {
    748         uprv_memset(Gens[i].ranges, 0, sizeof(Gens[i].ranges));
    749     }
    750 
    751     tok->toInsert = 1;
    752     t[tok->strength] = 1;
    753 
    754     while(tok->previous != NULL) {
    755         if(tok->previous->strength < tok->strength) { /* going up */
    756             t[tok->strength] = 0;
    757             t[tok->previous->strength]++;
    758         } else if(tok->previous->strength > tok->strength) { /* going down */
    759             t[tok->previous->strength] = 1;
    760         } else {
    761             t[tok->strength]++;
    762         }
    763         tok=tok->previous;
    764         tok->toInsert = t[tok->strength];
    765     }
    766 
    767     tok->toInsert = t[tok->strength];
    768     ucol_inv_getGapPositions(src, lh, status);
    769 
    770 #if UCOL_DEBUG
    771     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    772     int32_t j = 2;
    773     for(j = 2; j >= 0; j--) {
    774         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    775         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    776     }
    777     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    778 
    779     do {
    780         fprintf(stderr,"%i", tok->strength);
    781         tok = tok->next;
    782     } while(tok != NULL);
    783     fprintf(stderr, "\n");
    784 
    785     tok=&lh->first[UCOL_TOK_POLARITY_POSITIVE];
    786 
    787     do {
    788         fprintf(stderr,"%i", tok->toInsert);
    789         tok = tok->next;
    790     } while(tok != NULL);
    791 #endif
    792 
    793     tok = lh->first;
    794     uint32_t fStrength = UCOL_IDENTICAL;
    795     uint32_t initStrength = UCOL_IDENTICAL;
    796 
    797 
    798     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    799     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    800     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
    801 
    802     while (tok != NULL && U_SUCCESS(*status)) {
    803         fStrength = tok->strength;
    804         if(fStrength < initStrength) {
    805             initStrength = fStrength;
    806             if(lh->pos[fStrength] == -1) {
    807                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
    808                     fStrength--;
    809                 }
    810                 if(lh->pos[fStrength] == -1) {
    811                     *status = U_INTERNAL_PROGRAM_ERROR;
    812                     return;
    813                 }
    814             }
    815             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
    816                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    817                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
    818                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
    819                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
    820             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
    821                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    822                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
    823                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    824                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    825             } else { /* primaries */
    826                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
    827                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    828                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    829                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    830             }
    831         } else {
    832             if(tok->strength == UCOL_TERTIARY) {
    833                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
    834             } else if(tok->strength == UCOL_SECONDARY) {
    835                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
    836                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    837             } else if(tok->strength == UCOL_PRIMARY) {
    838                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
    839                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    840                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    841             }
    842         }
    843         ucol_doCE(src, CEparts, tok, status);
    844         tok = tok->next;
    845     }
    846 }
    847 
    848 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    849     UCAElements el;
    850     UColToken *tok = lh->first;
    851     UColToken *expt = NULL;
    852     uint32_t i = 0, j = 0;
    853     const Normalizer2Impl *nfcImpl = Normalizer2Factory::getNFCImpl(*status);
    854 
    855     while(tok != NULL && U_SUCCESS(*status)) {
    856         /* first, check if there are any expansions */
    857         /* if there are expansions, we need to do a little bit more processing */
    858         /* since parts of expansion can be tailored, while others are not */
    859         if(tok->expansion != 0) {
    860             uint32_t len = tok->expansion >> 24;
    861             uint32_t currentSequenceLen = len;
    862             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
    863             //uint32_t exp = currentSequenceLen | expOffset;
    864             UColToken exp;
    865             exp.source = currentSequenceLen | expOffset;
    866             exp.rulesToParseHdl = &(src->source);
    867 
    868             while(len > 0) {
    869                 currentSequenceLen = len;
    870                 while(currentSequenceLen > 0) {
    871                     exp.source = (currentSequenceLen << 24) | expOffset;
    872                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
    873                         uint32_t noOfCEsToCopy = expt->noOfCEs;
    874                         for(j = 0; j<noOfCEsToCopy; j++) {
    875                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
    876                         }
    877                         tok->noOfExpCEs += noOfCEsToCopy;
    878                         // Smart people never try to add codepoints and CEs.
    879                         // For some odd reason, it won't work.
    880                         expOffset += currentSequenceLen; //noOfCEsToCopy;
    881                         len -= currentSequenceLen; //noOfCEsToCopy;
    882                         break;
    883                     } else {
    884                         currentSequenceLen--;
    885                     }
    886                 }
    887                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
    888                     /* will have to get one from UCA */
    889                     /* first, get the UChars from the rules */
    890                     /* then pick CEs out until there is no more and stuff them into expansion */
    891                     collIterate s;
    892                     uint32_t order = 0;
    893                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s, status);
    894 
    895                     for(;;) {
    896                         order = ucol_getNextCE(src->UCA, &s, status);
    897                         if(order == UCOL_NO_MORE_CES) {
    898                             break;
    899                         }
    900                         tok->expCEs[tok->noOfExpCEs++] = order;
    901                     }
    902                     expOffset++;
    903                     len--;
    904                 }
    905             }
    906         } else {
    907             tok->noOfExpCEs = 0;
    908         }
    909 
    910         /* set the ucaelement with obtained values */
    911         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
    912         /* copy CEs */
    913         for(i = 0; i<tok->noOfCEs; i++) {
    914             el.CEs[i] = tok->CEs[i];
    915         }
    916         for(i = 0; i<tok->noOfExpCEs; i++) {
    917             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
    918         }
    919 
    920         /* copy UChars */
    921         // We kept prefix and source kind of together, as it is a kind of a contraction.
    922         // However, now we have to slice the prefix off the main thing -
    923         el.prefix = el.prefixChars;
    924         el.cPoints = el.uchars;
    925         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
    926             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
    927             // decomposed elements to the unsaf table.
    928             el.prefixSize = tok->prefix>>24;
    929             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
    930 
    931             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
    932             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
    933         } else {
    934             el.prefixSize = 0;
    935             *el.prefix = 0;
    936 
    937             el.cSize = (tok->source >> 24);
    938             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    939         }
    940         if(src->UCA != NULL) {
    941             for(i = 0; i<el.cSize; i++) {
    942                 if(UCOL_ISJAMO(el.cPoints[i])) {
    943                     t->image->jamoSpecial = TRUE;
    944                 }
    945             }
    946             if (!src->buildCCTabFlag && el.cSize > 0) {
    947                 // Check the trailing canonical combining class (tccc) of the last character.
    948                 const UChar *s = el.cPoints + el.cSize;
    949                 uint16_t fcd = nfcImpl->previousFCD16(el.cPoints, s);
    950                 if ((fcd & 0xff) != 0) {
    951                     src->buildCCTabFlag = TRUE;
    952                 }
    953             }
    954         }
    955 
    956         /* and then, add it */
    957 #if UCOL_DEBUG==2
    958         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
    959 #endif
    960         uprv_uca_addAnElement(t, &el, status);
    961 
    962 #if UCOL_DEBUG_DUPLICATES
    963         if(*status != U_ZERO_ERROR) {
    964             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
    965             *status = U_ZERO_ERROR;
    966         }
    967 #endif
    968 
    969         tok = tok->next;
    970     }
    971 }
    972 
    973 U_CDECL_BEGIN
    974 static UBool U_CALLCONV
    975 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    976     UErrorCode status = U_ZERO_ERROR;
    977     tempUCATable *t = (tempUCATable *)context;
    978     if(value == 0) {
    979         while(start < limit) {
    980             uint32_t CE = utrie_get32(t->mapping, start, NULL);
    981             if(CE == UCOL_NOT_FOUND) {
    982                 UCAElements el;
    983                 el.isThai = FALSE;
    984                 el.prefixSize = 0;
    985                 el.prefixChars[0] = 0;
    986                 el.prefix = el.prefixChars;
    987                 el.cPoints = el.uchars;
    988 
    989                 el.cSize = 0;
    990                 U16_APPEND_UNSAFE(el.uchars, el.cSize, start);
    991 
    992                 el.noOfCEs = 1;
    993                 el.CEs[0] = 0;
    994                 uprv_uca_addAnElement(t, &el, &status);
    995 
    996             }
    997             start++;
    998         }
    999     }
   1000     if(U_FAILURE(status)) {
   1001         return FALSE;
   1002     } else {
   1003         return TRUE;
   1004     }
   1005 }
   1006 U_CDECL_END
   1007 
   1008 static void
   1009 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
   1010                                UChar32 start, UChar32 end,
   1011                                UErrorCode *status)
   1012 {
   1013     //UChar decomp[256];
   1014     uint32_t CE = UCOL_NOT_FOUND;
   1015     UChar32 u = 0;
   1016     UCAElements el;
   1017     el.isThai = FALSE;
   1018     el.prefixSize = 0;
   1019     el.prefixChars[0] = 0;
   1020     collIterate colIt;
   1021 
   1022     if(U_SUCCESS(*status)) {
   1023         for(u = start; u<=end; u++) {
   1024             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
   1025                 /* this test is for contractions that are missing the starting element. */
   1026                 || ((isCntTableElement(CE)) &&
   1027                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
   1028                 )
   1029             {
   1030                 el.cSize = 0;
   1031                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
   1032                 //decomp[0] = (UChar)u;
   1033                 //el.uchars[0] = (UChar)u;
   1034                 el.cPoints = el.uchars;
   1035                 //el.cSize = 1;
   1036                 el.noOfCEs = 0;
   1037                 el.prefix = el.prefixChars;
   1038                 el.prefixSize = 0;
   1039                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
   1040                 // We actually want to check whether this element is a special
   1041                 // If it is an implicit element (hangul, CJK - we want to copy the
   1042                 // special, not the resolved CEs) - for hangul, copying resolved
   1043                 // would just make things the same (there is an expansion and it
   1044                 // takes approximately the same amount of time to resolve as
   1045                 // falling back to the UCA).
   1046                 /*
   1047                 UTRIE_GET32(src->UCA->mapping, u, CE);
   1048                 tag = getCETag(CE);
   1049                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
   1050                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
   1051                 || tag == LEAD_SURROGATE_TAG) {
   1052                 el.CEs[el.noOfCEs++] = CE;
   1053                 } else {
   1054                 */
   1055                 // It turns out that it does not make sense to keep implicits
   1056                 // unresolved. The cost of resolving them is big enough so that
   1057                 // it doesn't make any difference whether we have to go to the UCA
   1058                 // or not.
   1059                 {
   1060                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt, status);
   1061                     while(CE != UCOL_NO_MORE_CES) {
   1062                         CE = ucol_getNextCE(src->UCA, &colIt, status);
   1063                         if(CE != UCOL_NO_MORE_CES) {
   1064                             el.CEs[el.noOfCEs++] = CE;
   1065                         }
   1066                     }
   1067                 }
   1068                 uprv_uca_addAnElement(t, &el, status);
   1069             }
   1070         }
   1071     }
   1072 }
   1073 
   1074 U_NAMESPACE_END
   1075 
   1076 U_CFUNC UCATableHeader *
   1077 ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
   1078     U_NAMESPACE_USE
   1079 
   1080     uint32_t i = 0;
   1081     if(U_FAILURE(*status)) {
   1082         return NULL;
   1083     }
   1084     /*
   1085     2.  Eliminate the negative lists by doing the following for each non-null negative list:
   1086     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
   1087     create new ListHeader X
   1088     o   reverse the list, add to the end of X's positive list. Reset the strength of the
   1089     first item you add, based on the stronger strength levels of the two lists.
   1090     */
   1091     /*
   1092     3.  For each ListHeader with a non-null positive list:
   1093     */
   1094     /*
   1095     o   Find all character strings with CEs between the baseCE and the
   1096     next/previous CE, at the strength of the first token. Add these to the
   1097     tailoring.
   1098     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
   1099     tailoring has & x < z...
   1100     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
   1101     */
   1102     /* It is possible that this part should be done even while constructing list */
   1103     /* The problem is that it is unknown what is going to be the strongest weight */
   1104     /* So we might as well do it here */
   1105 
   1106     /*
   1107     o   Allocate CEs for each token in the list, based on the total number N of the
   1108     largest level difference, and the gap G between baseCE and nextCE at that
   1109     level. The relation * between the last item and nextCE is the same as the
   1110     strongest strength.
   1111     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
   1112     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
   1113     Then fit b and c into the secondary gap between a and d, then fit q
   1114     into the tertiary gap between b and c.
   1115 
   1116     o   Example: baseCE << b <<< q << c * nextCE(X,2)
   1117     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
   1118     Then fit q into the tertiary gap between b and c.
   1119     o   When incrementing primary values, we will not cross high byte
   1120     boundaries except where there is only a single-byte primary. That is to
   1121     ensure that the script reordering will continue to work.
   1122     */
   1123     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
   1124     /* test for NULL */
   1125     if (image == NULL) {
   1126         *status = U_MEMORY_ALLOCATION_ERROR;
   1127         return NULL;
   1128     }
   1129     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
   1130 
   1131     for(i = 0; i<src->resultLen; i++) {
   1132         /* now we need to generate the CEs */
   1133         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1134         /* According to strength                                                          */
   1135         if(U_SUCCESS(*status)) {
   1136             if(src->lh[i].first) { // if there are any elements
   1137                 // due to the way parser works, subsequent tailorings
   1138                 // may remove all the elements from a sequence, therefore
   1139                 // leaving an empty tailoring sequence.
   1140                 ucol_initBuffers(src, &src->lh[i], status);
   1141             }
   1142         }
   1143         if(U_FAILURE(*status)) {
   1144             uprv_free(image);
   1145             return NULL;
   1146         }
   1147     }
   1148 
   1149     if(src->varTop != NULL) { /* stuff the variable top value */
   1150         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
   1151         /* remove it from the list */
   1152         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
   1153             src->varTop->listHeader->first = src->varTop->next;
   1154         }
   1155         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
   1156             src->varTop->listHeader->last = src->varTop->previous;
   1157         }
   1158         if(src->varTop->next != NULL) {
   1159             src->varTop->next->previous = src->varTop->previous;
   1160         }
   1161         if(src->varTop->previous != NULL) {
   1162             src->varTop->previous->next = src->varTop->next;
   1163         }
   1164     }
   1165 
   1166 
   1167     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
   1168     if(U_FAILURE(*status)) {
   1169         uprv_free(image);
   1170         return NULL;
   1171     }
   1172 
   1173 
   1174     /* After this, we have assigned CE values to all regular CEs      */
   1175     /* now we will go through list once more and resolve expansions,  */
   1176     /* make UCAElements structs and add them to table                 */
   1177     for(i = 0; i<src->resultLen; i++) {
   1178         /* now we need to generate the CEs */
   1179         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1180         /* According to strength                                                          */
   1181         if(U_SUCCESS(*status)) {
   1182             ucol_createElements(src, t, &src->lh[i], status);
   1183         }
   1184     }
   1185 
   1186     UCAElements el;
   1187     el.isThai = FALSE;
   1188     el.prefixSize = 0;
   1189     el.prefixChars[0] = 0;
   1190 
   1191     /* add latin-1 stuff */
   1192     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
   1193 
   1194     /* add stuff for copying */
   1195     if(src->copySet != NULL) {
   1196         int32_t i = 0;
   1197         UnicodeSet *set = (UnicodeSet *)src->copySet;
   1198         for(i = 0; i < set->getRangeCount(); i++) {
   1199             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
   1200         }
   1201     }
   1202 
   1203     if(U_SUCCESS(*status)) {
   1204         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
   1205 
   1206         uint32_t tailoredCE = UCOL_NOT_FOUND;
   1207         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
   1208         int32_t maxUCAContractionLength = src->UCA->image->contractionUCACombosWidth;
   1209         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
   1210         // Check for null pointer
   1211         if (ucaEl == NULL) {
   1212             *status = U_MEMORY_ALLOCATION_ERROR;
   1213             return NULL;
   1214         }
   1215         while(*conts != 0) {
   1216             // A continuation is NUL-terminated and NUL-padded
   1217             // except if it has the maximum length.
   1218             int32_t contractionLength = maxUCAContractionLength;
   1219             while(contractionLength > 0 && conts[contractionLength - 1] == 0) {
   1220                 --contractionLength;
   1221             }
   1222             UChar32 first;
   1223             int32_t firstLength = 0;
   1224             U16_NEXT(conts, firstLength, contractionLength, first);
   1225             tailoredCE = utrie_get32(t->mapping, first, NULL);
   1226             if(tailoredCE != UCOL_NOT_FOUND) {
   1227                 UBool needToAdd = TRUE;
   1228                 if(isCntTableElement(tailoredCE)) {
   1229                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+firstLength, status) == TRUE) {
   1230                         needToAdd = FALSE;
   1231                     }
   1232                 }
   1233                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
   1234                     UCAElements elm;
   1235                     elm.cPoints = el.uchars;
   1236                     elm.noOfCEs = 0;
   1237                     elm.uchars[0] = *conts;
   1238                     elm.uchars[1] = 0;
   1239                     elm.cSize = 1;
   1240                     elm.prefixChars[0] = *(conts+2);
   1241                     elm.isThai = FALSE;
   1242                     elm.prefix = elm.prefixChars;
   1243                     elm.prefixSize = 1;
   1244                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
   1245                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
   1246                         needToAdd = TRUE;
   1247                     }
   1248                 }
   1249                 if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
   1250                     needToAdd = FALSE;
   1251                 }
   1252 
   1253                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
   1254                     if (*(conts+1) != 0) {  // contractions
   1255                         el.prefix = el.prefixChars;
   1256                         el.prefixSize = 0;
   1257                         el.cPoints = el.uchars;
   1258                         el.noOfCEs = 0;
   1259                         u_memcpy(el.uchars, conts, contractionLength);
   1260                         el.cSize = contractionLength;
   1261                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
   1262                     }
   1263                     else { // pre-context character
   1264                         UChar str[4] = { 0 };
   1265                         int32_t len=0;
   1266                         int32_t preKeyLen=0;
   1267 
   1268                         el.cPoints = el.uchars;
   1269                         el.noOfCEs = 0;
   1270                         el.uchars[0] = *conts;
   1271                         el.uchars[1] = 0;
   1272                         el.cSize = 1;
   1273                         el.prefixChars[0] = *(conts+2);
   1274                         el.prefix = el.prefixChars;
   1275                         el.prefixSize = 1;
   1276                         if (el.prefixChars[0]!=0) {
   1277                             // get CE of prefix character first
   1278                             str[0]=el.prefixChars[0];
   1279                             str[1]=0;
   1280                             ucol_setText(ucaEl, str, 1, status);
   1281                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
   1282                                     != UCOL_NULLORDER) {
   1283                                 preKeyLen++;  // count number of keys for prefix character
   1284                             }
   1285                             str[len++] = el.prefixChars[0];
   1286                         }
   1287 
   1288                         str[len++] = el.uchars[0];
   1289                         str[len]=0;
   1290                         ucol_setText(ucaEl, str, len, status);
   1291                         // Skip the keys for prefix character, then copy the rest to el.
   1292                         while ((preKeyLen-->0) &&
   1293                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1294                             continue;
   1295                         }
   1296 
   1297                     }
   1298                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1299                         el.noOfCEs++;
   1300                     }
   1301                     uprv_uca_addAnElement(t, &el, status);
   1302                 }
   1303 
   1304             } else if(src->removeSet != NULL && uset_contains(src->removeSet, first)) {
   1305                 ucol_uprv_bld_copyRangeFromUCA(src, t, first, first, status);
   1306             }
   1307             conts+=maxUCAContractionLength;
   1308         }
   1309         ucol_closeElements(ucaEl);
   1310     }
   1311 
   1312     // Add completely ignorable elements
   1313     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
   1314 
   1315     // add tailoring characters related canonical closures
   1316     uprv_uca_canonicalClosure(t, src, NULL, status);
   1317 
   1318     /* still need to produce compatibility closure */
   1319 
   1320     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1321 
   1322     uprv_uca_closeTempTable(t);
   1323     uprv_free(image);
   1324 
   1325     return myData;
   1326 }
   1327 
   1328 U_CDECL_BEGIN
   1329 static UBool U_CALLCONV
   1330 ucol_bld_cleanup(void)
   1331 {
   1332     udata_close(invUCA_DATA_MEM);
   1333     invUCA_DATA_MEM = NULL;
   1334     _staticInvUCA = NULL;
   1335     gStaticInvUCAInitOnce.reset();
   1336     return TRUE;
   1337 }
   1338 U_CDECL_END
   1339 
   1340 static void U_CALLCONV initInverseUCA(UErrorCode &status) {
   1341     U_ASSERT(invUCA_DATA_MEM == NULL);
   1342     U_ASSERT(_staticInvUCA == NULL);
   1343     ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
   1344     InverseUCATableHeader *newInvUCA = NULL;
   1345     UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, &status);
   1346 
   1347     if(U_FAILURE(status)) {
   1348         if (result) {
   1349             udata_close(result);
   1350         }
   1351         // This is not needed, as we are talking about
   1352         // memory we got from UData
   1353         //uprv_free(newInvUCA);
   1354         return;
   1355     }
   1356 
   1357     if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
   1358         newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
   1359         UCollator *UCA = ucol_initUCA(&status);
   1360         // UCA versions of UCA and inverse UCA should match
   1361         if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
   1362             status = U_INVALID_FORMAT_ERROR;
   1363             udata_close(result);
   1364             return;
   1365         }
   1366 
   1367         invUCA_DATA_MEM = result;
   1368         _staticInvUCA = newInvUCA;
   1369     }
   1370 }
   1371 
   1372 
   1373 U_CAPI const InverseUCATableHeader * U_EXPORT2
   1374 ucol_initInverseUCA(UErrorCode *status)
   1375 {
   1376     umtx_initOnce(gStaticInvUCAInitOnce, &initInverseUCA, *status);
   1377     return _staticInvUCA;
   1378 }
   1379 
   1380 /* This is the data that is used for non-script reordering codes. These _must_ be kept
   1381  * in order that they are to be applied as defaults and in synch with the UColReorderCode enum.
   1382  */
   1383 static const char * const ReorderingTokenNames[] = {
   1384     "SPACE",
   1385     "PUNCT",
   1386     "SYMBOL",
   1387     "CURRENCY",
   1388     "DIGIT"
   1389 };
   1390 
   1391 static void toUpper(const char* src, char* dst, uint32_t length) {
   1392    for (uint32_t i = 0; *src != '\0' && i < length - 1; ++src, ++dst, ++i) {
   1393        *dst = uprv_toupper(*src);
   1394    }
   1395    *dst = '\0';
   1396 }
   1397 
   1398 U_INTERNAL int32_t U_EXPORT2
   1399 ucol_findReorderingEntry(const char* name) {
   1400     char buffer[32];
   1401     toUpper(name, buffer, 32);
   1402     for (uint32_t entry = 0; entry < LENGTHOF(ReorderingTokenNames); entry++) {
   1403         if (uprv_strcmp(buffer, ReorderingTokenNames[entry]) == 0) {
   1404             return entry + UCOL_REORDER_CODE_FIRST;
   1405         }
   1406     }
   1407     return USCRIPT_INVALID_CODE;
   1408 }
   1409 
   1410 #endif /* #if !UCONFIG_NO_COLLATION */
   1411