Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 2001-2008, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_bld.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created 02/22/2001
     14 *   created by: Vladimir Weinstein
     15 *
     16 * This module builds a collator based on the rule set.
     17 *
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_COLLATION
     23 
     24 #include "unicode/ucoleitr.h"
     25 #include "unicode/udata.h"
     26 #include "unicode/uchar.h"
     27 #include "unicode/uniset.h"
     28 #include "ucol_bld.h"
     29 #include "ucol_elm.h"
     30 #include "ucol_cnt.h"
     31 #include "ucln_in.h"
     32 #include "umutex.h"
     33 #include "unormimp.h"
     34 #include "cmemory.h"
     35 
     36 static const InverseUCATableHeader* _staticInvUCA = NULL;
     37 static UDataMemory* invUCA_DATA_MEM = NULL;
     38 
     39 U_CDECL_BEGIN
     40 static UBool U_CALLCONV
     41 isAcceptableInvUCA(void * /*context*/,
     42                    const char * /*type*/, const char * /*name*/,
     43                    const UDataInfo *pInfo)
     44 {
     45     /* context, type & name are intentionally not used */
     46     if( pInfo->size>=20 &&
     47         pInfo->isBigEndian==U_IS_BIG_ENDIAN &&
     48         pInfo->charsetFamily==U_CHARSET_FAMILY &&
     49         pInfo->dataFormat[0]==INVUCA_DATA_FORMAT_0 &&   /* dataFormat="InvC" */
     50         pInfo->dataFormat[1]==INVUCA_DATA_FORMAT_1 &&
     51         pInfo->dataFormat[2]==INVUCA_DATA_FORMAT_2 &&
     52         pInfo->dataFormat[3]==INVUCA_DATA_FORMAT_3 &&
     53         pInfo->formatVersion[0]==INVUCA_FORMAT_VERSION_0 &&
     54         pInfo->formatVersion[1]>=INVUCA_FORMAT_VERSION_1 //&&
     55         //pInfo->formatVersion[1]==INVUCA_FORMAT_VERSION_1 &&
     56         //pInfo->formatVersion[2]==INVUCA_FORMAT_VERSION_2 &&
     57         //pInfo->formatVersion[3]==INVUCA_FORMAT_VERSION_3 &&
     58         )
     59     {
     60         UVersionInfo UCDVersion;
     61         u_getUnicodeVersion(UCDVersion);
     62         return (pInfo->dataVersion[0]==UCDVersion[0] &&
     63             pInfo->dataVersion[1]==UCDVersion[1]);
     64             //pInfo->dataVersion[1]==invUcaDataInfo.dataVersion[1] &&
     65             //pInfo->dataVersion[2]==invUcaDataInfo.dataVersion[2] &&
     66             //pInfo->dataVersion[3]==invUcaDataInfo.dataVersion[3]) {
     67     } else {
     68         return FALSE;
     69     }
     70 }
     71 U_CDECL_END
     72 
     73 /*
     74 * Takes two CEs (lead and continuation) and
     75 * compares them as CEs should be compared:
     76 * primary vs. primary, secondary vs. secondary
     77 * tertiary vs. tertiary
     78 */
     79 static int32_t compareCEs(uint32_t source0, uint32_t source1, uint32_t target0, uint32_t target1) {
     80     uint32_t s1 = source0, s2, t1 = target0, t2;
     81     if(isContinuation(source1)) {
     82         s2 = source1;
     83     } else {
     84         s2 = 0;
     85     }
     86     if(isContinuation(target1)) {
     87         t2 = target1;
     88     } else {
     89         t2 = 0;
     90     }
     91 
     92     uint32_t s = 0, t = 0;
     93     if(s1 == t1 && s2 == t2) {
     94         return 0;
     95     }
     96     s = (s1 & 0xFFFF0000)|((s2 & 0xFFFF0000)>>16);
     97     t = (t1 & 0xFFFF0000)|((t2 & 0xFFFF0000)>>16);
     98     if(s < t) {
     99         return -1;
    100     } else if(s > t) {
    101         return 1;
    102     } else {
    103         s = (s1 & 0x0000FF00) | (s2 & 0x0000FF00)>>8;
    104         t = (t1 & 0x0000FF00) | (t2 & 0x0000FF00)>>8;
    105         if(s < t) {
    106             return -1;
    107         } else if(s > t) {
    108             return 1;
    109         } else {
    110             s = (s1 & 0x000000FF)<<8 | (s2 & 0x000000FF);
    111             t = (t1 & 0x000000FF)<<8 | (t2 & 0x000000FF);
    112             if(s < t) {
    113                 return -1;
    114             } else {
    115                 return 1;
    116             }
    117         }
    118     }
    119 }
    120 
    121 static
    122 int32_t ucol_inv_findCE(const UColTokenParser *src, uint32_t CE, uint32_t SecondCE) {
    123     uint32_t bottom = 0, top = src->invUCA->tableSize;
    124     uint32_t i = 0;
    125     uint32_t first = 0, second = 0;
    126     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    127     int32_t res = 0;
    128 
    129     while(bottom < top-1) {
    130         i = (top+bottom)/2;
    131         first = *(CETable+3*i);
    132         second = *(CETable+3*i+1);
    133         res = compareCEs(first, second, CE, SecondCE);
    134         if(res > 0) {
    135             top = i;
    136         } else if(res < 0) {
    137             bottom = i;
    138         } else {
    139             break;
    140         }
    141     }
    142 
    143     /* weiv:                                                  */
    144     /* in searching for elements, I have removed the failure  */
    145     /* The reason for this is that the builder does not rely  */
    146     /* on search mechanism telling it that it didn't find an  */
    147     /* element. However, indirect positioning relies on being */
    148     /* able to find the elements around any CE, even if it is */
    149     /* not defined in the UCA. */
    150     return i;
    151     /*
    152     if((first == CE && second == SecondCE)) {
    153     return i;
    154     } else {
    155     return -1;
    156     }
    157     */
    158 }
    159 
    160 static const uint32_t strengthMask[UCOL_CE_STRENGTH_LIMIT] = {
    161     0xFFFF0000,
    162     0xFFFFFF00,
    163     0xFFFFFFFF
    164 };
    165 
    166 U_CAPI int32_t U_EXPORT2 ucol_inv_getNextCE(const UColTokenParser *src,
    167                                             uint32_t CE, uint32_t contCE,
    168                                             uint32_t *nextCE, uint32_t *nextContCE,
    169                                             uint32_t strength)
    170 {
    171     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    172     int32_t iCE;
    173 
    174     iCE = ucol_inv_findCE(src, CE, contCE);
    175 
    176     if(iCE<0) {
    177         *nextCE = UCOL_NOT_FOUND;
    178         return -1;
    179     }
    180 
    181     CE &= strengthMask[strength];
    182     contCE &= strengthMask[strength];
    183 
    184     *nextCE = CE;
    185     *nextContCE = contCE;
    186 
    187     while((*nextCE  & strengthMask[strength]) == CE
    188         && (*nextContCE  & strengthMask[strength]) == contCE)
    189     {
    190         *nextCE = (*(CETable+3*(++iCE)));
    191         *nextContCE = (*(CETable+3*(iCE)+1));
    192     }
    193 
    194     return iCE;
    195 }
    196 
    197 U_CFUNC int32_t U_EXPORT2 ucol_inv_getPrevCE(const UColTokenParser *src,
    198                                             uint32_t CE, uint32_t contCE,
    199                                             uint32_t *prevCE, uint32_t *prevContCE,
    200                                             uint32_t strength)
    201 {
    202     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    203     int32_t iCE;
    204 
    205     iCE = ucol_inv_findCE(src, CE, contCE);
    206 
    207     if(iCE<0) {
    208         *prevCE = UCOL_NOT_FOUND;
    209         return -1;
    210     }
    211 
    212     CE &= strengthMask[strength];
    213     contCE &= strengthMask[strength];
    214 
    215     *prevCE = CE;
    216     *prevContCE = contCE;
    217 
    218     while((*prevCE  & strengthMask[strength]) == CE
    219         && (*prevContCE  & strengthMask[strength])== contCE
    220         && iCE > 0) /* this condition should prevent falling off the edge of the world */
    221     {
    222         /* here, we end up in a singularity - zero */
    223         *prevCE = (*(CETable+3*(--iCE)));
    224         *prevContCE = (*(CETable+3*(iCE)+1));
    225     }
    226 
    227     return iCE;
    228 }
    229 
    230 U_CFUNC uint32_t U_EXPORT2 ucol_getCEStrengthDifference(uint32_t CE, uint32_t contCE,
    231                                                        uint32_t prevCE, uint32_t prevContCE)
    232 {
    233     if(prevCE == CE && prevContCE == contCE) {
    234         return UCOL_IDENTICAL;
    235     }
    236     if((prevCE & strengthMask[UCOL_PRIMARY]) != (CE & strengthMask[UCOL_PRIMARY])
    237         || (prevContCE & strengthMask[UCOL_PRIMARY]) != (contCE & strengthMask[UCOL_PRIMARY]))
    238     {
    239         return UCOL_PRIMARY;
    240     }
    241     if((prevCE & strengthMask[UCOL_SECONDARY]) != (CE & strengthMask[UCOL_SECONDARY])
    242         || (prevContCE & strengthMask[UCOL_SECONDARY]) != (contCE & strengthMask[UCOL_SECONDARY]))
    243     {
    244         return UCOL_SECONDARY;
    245     }
    246     return UCOL_TERTIARY;
    247 }
    248 
    249 
    250 /*static
    251 inline int32_t ucol_inv_getPrevious(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    252 
    253     uint32_t CE = lh->baseCE;
    254     uint32_t SecondCE = lh->baseContCE;
    255 
    256     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    257     uint32_t previousCE, previousContCE;
    258     int32_t iCE;
    259 
    260     iCE = ucol_inv_findCE(src, CE, SecondCE);
    261 
    262     if(iCE<0) {
    263         return -1;
    264     }
    265 
    266     CE &= strengthMask[strength];
    267     SecondCE &= strengthMask[strength];
    268 
    269     previousCE = CE;
    270     previousContCE = SecondCE;
    271 
    272     while((previousCE  & strengthMask[strength]) == CE && (previousContCE  & strengthMask[strength])== SecondCE) {
    273         previousCE = (*(CETable+3*(--iCE)));
    274         previousContCE = (*(CETable+3*(iCE)+1));
    275     }
    276     lh->previousCE = previousCE;
    277     lh->previousContCE = previousContCE;
    278 
    279     return iCE;
    280 }*/
    281 
    282 static
    283 inline int32_t ucol_inv_getNext(UColTokenParser *src, UColTokListHeader *lh, uint32_t strength) {
    284     uint32_t CE = lh->baseCE;
    285     uint32_t SecondCE = lh->baseContCE;
    286 
    287     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    288     uint32_t nextCE, nextContCE;
    289     int32_t iCE;
    290 
    291     iCE = ucol_inv_findCE(src, CE, SecondCE);
    292 
    293     if(iCE<0) {
    294         return -1;
    295     }
    296 
    297     CE &= strengthMask[strength];
    298     SecondCE &= strengthMask[strength];
    299 
    300     nextCE = CE;
    301     nextContCE = SecondCE;
    302 
    303     while((nextCE  & strengthMask[strength]) == CE
    304         && (nextContCE  & strengthMask[strength]) == SecondCE)
    305     {
    306         nextCE = (*(CETable+3*(++iCE)));
    307         nextContCE = (*(CETable+3*(iCE)+1));
    308     }
    309 
    310     lh->nextCE = nextCE;
    311     lh->nextContCE = nextContCE;
    312 
    313     return iCE;
    314 }
    315 
    316 static void ucol_inv_getGapPositions(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    317     /* reset all the gaps */
    318     int32_t i = 0;
    319     uint32_t *CETable = (uint32_t *)((uint8_t *)src->invUCA+src->invUCA->table);
    320     uint32_t st = 0;
    321     uint32_t t1, t2;
    322     int32_t pos;
    323 
    324     UColToken *tok = lh->first;
    325     uint32_t tokStrength = tok->strength;
    326 
    327     for(i = 0; i<3; i++) {
    328         lh->gapsHi[3*i] = 0;
    329         lh->gapsHi[3*i+1] = 0;
    330         lh->gapsHi[3*i+2] = 0;
    331         lh->gapsLo[3*i] = 0;
    332         lh->gapsLo[3*i+1] = 0;
    333         lh->gapsLo[3*i+2] = 0;
    334         lh->numStr[i] = 0;
    335         lh->fStrToken[i] = NULL;
    336         lh->lStrToken[i] = NULL;
    337         lh->pos[i] = -1;
    338     }
    339 
    340     UCAConstants *consts = (UCAConstants *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts);
    341 
    342     if((lh->baseCE & 0xFF000000)>= (consts->UCA_PRIMARY_IMPLICIT_MIN<<24) && (lh->baseCE & 0xFF000000) <= (consts->UCA_PRIMARY_IMPLICIT_MAX<<24) ) { /* implicits - */
    343         //if(lh->baseCE >= PRIMARY_IMPLICIT_MIN && lh->baseCE < PRIMARY_IMPLICIT_MAX ) { /* implicits - */
    344         lh->pos[0] = 0;
    345         t1 = lh->baseCE;
    346         t2 = lh->baseContCE & UCOL_REMOVE_CONTINUATION;
    347         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    348         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    349         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    350         uint32_t primaryCE = t1 & UCOL_PRIMARYMASK | (t2 & UCOL_PRIMARYMASK) >> 16;
    351         primaryCE = uprv_uca_getImplicitFromRaw(uprv_uca_getRawFromImplicit(primaryCE)+1);
    352 
    353         t1 = primaryCE & UCOL_PRIMARYMASK | 0x0505;
    354         t2 = (primaryCE << 16) & UCOL_PRIMARYMASK; // | UCOL_CONTINUATION_MARKER;
    355 
    356         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    357         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    358         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    359     } else if(lh->indirect == TRUE && lh->nextCE != 0) {
    360         //} else if(lh->baseCE == UCOL_RESET_TOP_VALUE && lh->baseContCE == 0) {
    361         lh->pos[0] = 0;
    362         t1 = lh->baseCE;
    363         t2 = lh->baseContCE&UCOL_REMOVE_CONTINUATION;
    364         lh->gapsLo[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    365         lh->gapsLo[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    366         lh->gapsLo[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    367         t1 = lh->nextCE;
    368         t2 = lh->nextContCE&UCOL_REMOVE_CONTINUATION;
    369         lh->gapsHi[0] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    370         lh->gapsHi[1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    371         lh->gapsHi[2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    372     } else {
    373         for(;;) {
    374             if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    375                 if((lh->pos[tokStrength] = ucol_inv_getNext(src, lh, tokStrength)) >= 0) {
    376                     lh->fStrToken[tokStrength] = tok;
    377                 } else { /* The CE must be implicit, since it's not in the table */
    378                     /* Error */
    379                     *status = U_INTERNAL_PROGRAM_ERROR;
    380                 }
    381             }
    382 
    383             while(tok != NULL && tok->strength >= tokStrength) {
    384                 if(tokStrength < UCOL_CE_STRENGTH_LIMIT) {
    385                     lh->lStrToken[tokStrength] = tok;
    386                 }
    387                 tok = tok->next;
    388             }
    389             if(tokStrength < UCOL_CE_STRENGTH_LIMIT-1) {
    390                 /* check if previous interval is the same and merge the intervals if it is so */
    391                 if(lh->pos[tokStrength] == lh->pos[tokStrength+1]) {
    392                     lh->fStrToken[tokStrength] = lh->fStrToken[tokStrength+1];
    393                     lh->fStrToken[tokStrength+1] = NULL;
    394                     lh->lStrToken[tokStrength+1] = NULL;
    395                     lh->pos[tokStrength+1] = -1;
    396                 }
    397             }
    398             if(tok != NULL) {
    399                 tokStrength = tok->strength;
    400             } else {
    401                 break;
    402             }
    403         }
    404         for(st = 0; st < 3; st++) {
    405             if((pos = lh->pos[st]) >= 0) {
    406                 t1 = *(CETable+3*(pos));
    407                 t2 = *(CETable+3*(pos)+1);
    408                 lh->gapsHi[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    409                 lh->gapsHi[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    410                 //lh->gapsHi[3*st+2] = (UCOL_TERTIARYORDER(t1)) << 24 | (UCOL_TERTIARYORDER(t2)) << 16;
    411                 lh->gapsHi[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    412                 //pos--;
    413                 //t1 = *(CETable+3*(pos));
    414                 //t2 = *(CETable+3*(pos)+1);
    415                 t1 = lh->baseCE;
    416                 t2 = lh->baseContCE;
    417                 lh->gapsLo[3*st] = (t1 & UCOL_PRIMARYMASK) | (t2 & UCOL_PRIMARYMASK) >> 16;
    418                 lh->gapsLo[3*st+1] = (t1 & UCOL_SECONDARYMASK) << 16 | (t2 & UCOL_SECONDARYMASK) << 8;
    419                 lh->gapsLo[3*st+2] = (t1&0x3f) << 24 | (t2&0x3f) << 16;
    420             }
    421         }
    422     }
    423 }
    424 
    425 
    426 #define ucol_countBytes(value, noOfBytes)   \
    427 {                               \
    428     uint32_t mask = 0xFFFFFFFF;   \
    429     (noOfBytes) = 0;              \
    430     while(mask != 0) {            \
    431     if(((value) & mask) != 0) { \
    432     (noOfBytes)++;            \
    433     }                           \
    434     mask >>= 8;                 \
    435     }                             \
    436 }
    437 
    438 static uint32_t ucol_getNextGenerated(ucolCEGenerator *g, UErrorCode *status) {
    439     if(U_SUCCESS(*status)) {
    440         g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    441     }
    442     return g->current;
    443 }
    444 
    445 static uint32_t ucol_getSimpleCEGenerator(ucolCEGenerator *g, UColToken *tok, uint32_t strength, UErrorCode *status) {
    446     /* TODO: rename to enum names */
    447     uint32_t high, low, count=1;
    448     uint32_t maxByte = (strength == UCOL_TERTIARY)?0x3F:0xFF;
    449 
    450     if(strength == UCOL_SECONDARY) {
    451         low = UCOL_COMMON_TOP2<<24;
    452         high = 0xFFFFFFFF;
    453         count = 0xFF - UCOL_COMMON_TOP2;
    454     } else {
    455         low = UCOL_BYTE_COMMON << 24; //0x05000000;
    456         high = 0x40000000;
    457         count = 0x40 - UCOL_BYTE_COMMON;
    458     }
    459 
    460     if(tok->next != NULL && tok->next->strength == strength) {
    461         count = tok->next->toInsert;
    462     }
    463 
    464     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    465     g->current = UCOL_BYTE_COMMON<<24;
    466 
    467     if(g->noOfRanges == 0) {
    468         *status = U_INTERNAL_PROGRAM_ERROR;
    469     }
    470     return g->current;
    471 }
    472 
    473 static uint32_t ucol_getCEGenerator(ucolCEGenerator *g, uint32_t* lows, uint32_t* highs, UColToken *tok, uint32_t fStrength, UErrorCode *status) {
    474     uint32_t strength = tok->strength;
    475     uint32_t low = lows[fStrength*3+strength];
    476     uint32_t high = highs[fStrength*3+strength];
    477     uint32_t maxByte = 0;
    478     if(strength == UCOL_TERTIARY) {
    479         maxByte = 0x3F;
    480     } else if(strength == UCOL_PRIMARY) {
    481         maxByte = 0xFE;
    482     } else {
    483         maxByte = 0xFF;
    484     }
    485 
    486     uint32_t count = tok->toInsert;
    487 
    488     if(low >= high && strength > UCOL_PRIMARY) {
    489         int32_t s = strength;
    490         for(;;) {
    491             s--;
    492             if(lows[fStrength*3+s] != highs[fStrength*3+s]) {
    493                 if(strength == UCOL_SECONDARY) {
    494                     if (low < UCOL_COMMON_TOP2<<24 ) {
    495                        // Override if low range is less than UCOL_COMMON_TOP2.
    496 		        low = UCOL_COMMON_TOP2<<24;
    497                     }
    498                     high = 0xFFFFFFFF;
    499                 } else {
    500                     // Override if low range is less than UCOL_COMMON_BOT3.
    501 		    if ( low < UCOL_COMMON_BOT3<<24 ) {
    502                         low = UCOL_COMMON_BOT3<<24;
    503 		    }
    504                     high = 0x40000000;
    505                 }
    506                 break;
    507             }
    508             if(s<0) {
    509                 *status = U_INTERNAL_PROGRAM_ERROR;
    510                 return 0;
    511             }
    512         }
    513     }
    514 
    515     if(low == 0) {
    516         low = 0x01000000;
    517     }
    518 
    519     if(strength == UCOL_SECONDARY) { /* similar as simple */
    520         if(low >= (UCOL_COMMON_BOT2<<24) && low < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    521             low = UCOL_COMMON_TOP2<<24;
    522         }
    523         if(high > (UCOL_COMMON_BOT2<<24) && high < (uint32_t)(UCOL_COMMON_TOP2<<24)) {
    524             high = UCOL_COMMON_TOP2<<24;
    525         }
    526         if(low < (UCOL_COMMON_BOT2<<24)) {
    527             g->noOfRanges = ucol_allocWeights(UCOL_BYTE_UNSHIFTED_MIN<<24, high, count, maxByte, g->ranges);
    528             g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    529             //g->current = UCOL_COMMON_BOT2<<24;
    530             return g->current;
    531         }
    532     }
    533 
    534     g->noOfRanges = ucol_allocWeights(low, high, count, maxByte, g->ranges);
    535     if(g->noOfRanges == 0) {
    536         *status = U_INTERNAL_PROGRAM_ERROR;
    537     }
    538     g->current = ucol_nextWeight(g->ranges, &g->noOfRanges);
    539     return g->current;
    540 }
    541 
    542 static
    543 uint32_t u_toLargeKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    544     uint32_t i = 0;
    545     UChar c;
    546 
    547     if(U_FAILURE(*status)) {
    548         return 0;
    549     }
    550 
    551     if(sourceLen > resLen) {
    552         *status = U_MEMORY_ALLOCATION_ERROR;
    553         return 0;
    554     }
    555 
    556     for(i = 0; i < sourceLen; i++) {
    557         c = source[i];
    558         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    559             switch(c - 0x3000) {
    560             case 0x41: case 0x43: case 0x45: case 0x47: case 0x49: case 0x63: case 0x83: case 0x85: case 0x8E:
    561             case 0xA1: case 0xA3: case 0xA5: case 0xA7: case 0xA9: case 0xC3: case 0xE3: case 0xE5: case 0xEE:
    562                 c++;
    563                 break;
    564             case 0xF5:
    565                 c = 0x30AB;
    566                 break;
    567             case 0xF6:
    568                 c = 0x30B1;
    569                 break;
    570             }
    571         }
    572         resBuf[i] = c;
    573     }
    574     return sourceLen;
    575 }
    576 
    577 static
    578 uint32_t u_toSmallKana(const UChar *source, const uint32_t sourceLen, UChar *resBuf, const uint32_t resLen, UErrorCode *status) {
    579     uint32_t i = 0;
    580     UChar c;
    581 
    582     if(U_FAILURE(*status)) {
    583         return 0;
    584     }
    585 
    586     if(sourceLen > resLen) {
    587         *status = U_MEMORY_ALLOCATION_ERROR;
    588         return 0;
    589     }
    590 
    591     for(i = 0; i < sourceLen; i++) {
    592         c = source[i];
    593         if(0x3041 <= c && c <= 0x30FA) { /* Kana range */
    594             switch(c - 0x3000) {
    595             case 0x42: case 0x44: case 0x46: case 0x48: case 0x4A: case 0x64: case 0x84: case 0x86: case 0x8F:
    596             case 0xA2: case 0xA4: case 0xA6: case 0xA8: case 0xAA: case 0xC4: case 0xE4: case 0xE6: case 0xEF:
    597                 c--;
    598                 break;
    599             case 0xAB:
    600                 c = 0x30F5;
    601                 break;
    602             case 0xB1:
    603                 c = 0x30F6;
    604                 break;
    605             }
    606         }
    607         resBuf[i] = c;
    608     }
    609     return sourceLen;
    610 }
    611 
    612 static
    613 uint8_t ucol_uprv_getCaseBits(const UCollator *UCA, const UChar *src, uint32_t len, UErrorCode *status) {
    614     uint32_t i = 0;
    615     UChar n[128];
    616     uint32_t nLen = 0;
    617     uint32_t uCount = 0, lCount = 0;
    618 
    619     collIterate s;
    620     uint32_t order = 0;
    621 
    622     if(U_FAILURE(*status)) {
    623         return UCOL_LOWER_CASE;
    624     }
    625 
    626     nLen = unorm_normalize(src, len, UNORM_NFKD, 0, n, 128, status);
    627     if(U_SUCCESS(*status)) {
    628         for(i = 0; i < nLen; i++) {
    629             uprv_init_collIterate(UCA, &n[i], 1, &s);
    630             order = ucol_getNextCE(UCA, &s, status);
    631             if(isContinuation(order)) {
    632                 *status = U_INTERNAL_PROGRAM_ERROR;
    633                 return UCOL_LOWER_CASE;
    634             }
    635             if((order&UCOL_CASE_BIT_MASK)== UCOL_UPPER_CASE) {
    636                 uCount++;
    637             } else {
    638                 if(u_islower(n[i])) {
    639                     lCount++;
    640                 } else if(U_SUCCESS(*status)) {
    641                     UChar sk[1], lk[1];
    642                     u_toSmallKana(&n[i], 1, sk, 1, status);
    643                     u_toLargeKana(&n[i], 1, lk, 1, status);
    644                     if(sk[0] == n[i] && lk[0] != n[i]) {
    645                         lCount++;
    646                     }
    647                 }
    648             }
    649         }
    650     }
    651 
    652     if(uCount != 0 && lCount != 0) {
    653         return UCOL_MIXED_CASE;
    654     } else if(uCount != 0) {
    655         return UCOL_UPPER_CASE;
    656     } else {
    657         return UCOL_LOWER_CASE;
    658     }
    659 }
    660 
    661 
    662 U_CFUNC void ucol_doCE(UColTokenParser *src, uint32_t *CEparts, UColToken *tok, UErrorCode *status) {
    663     /* this one makes the table and stuff */
    664     uint32_t noOfBytes[3];
    665     uint32_t i;
    666 
    667     for(i = 0; i<3; i++) {
    668         ucol_countBytes(CEparts[i], noOfBytes[i]);
    669     }
    670 
    671     /* Here we have to pack CEs from parts */
    672 
    673     uint32_t CEi = 0;
    674     uint32_t value = 0;
    675 
    676     while(2*CEi<noOfBytes[0] || CEi<noOfBytes[1] || CEi<noOfBytes[2]) {
    677         if(CEi > 0) {
    678             value = UCOL_CONTINUATION_MARKER; /* Continuation marker */
    679         } else {
    680             value = 0;
    681         }
    682 
    683         if(2*CEi<noOfBytes[0]) {
    684             value |= ((CEparts[0]>>(32-16*(CEi+1))) & 0xFFFF) << 16;
    685         }
    686         if(CEi<noOfBytes[1]) {
    687             value |= ((CEparts[1]>>(32-8*(CEi+1))) & 0xFF) << 8;
    688         }
    689         if(CEi<noOfBytes[2]) {
    690             value |= ((CEparts[2]>>(32-8*(CEi+1))) & 0x3F);
    691         }
    692         tok->CEs[CEi] = value;
    693         CEi++;
    694     }
    695     if(CEi == 0) { /* totally ignorable */
    696         tok->noOfCEs = 1;
    697         tok->CEs[0] = 0;
    698     } else { /* there is at least something */
    699         tok->noOfCEs = CEi;
    700     }
    701 
    702 
    703     // we want to set case bits here and now, not later.
    704     // Case bits handling
    705     if(tok->CEs[0] != 0) { // case bits should be set only for non-ignorables
    706         tok->CEs[0] &= 0xFFFFFF3F; // Clean the case bits field
    707         int32_t cSize = (tok->source & 0xFF000000) >> 24;
    708         UChar *cPoints = (tok->source & 0x00FFFFFF) + src->source;
    709 
    710         if(cSize > 1) {
    711             // Do it manually
    712             tok->CEs[0] |= ucol_uprv_getCaseBits(src->UCA, cPoints, cSize, status);
    713         } else {
    714             // Copy it from the UCA
    715             uint32_t caseCE = ucol_getFirstCE(src->UCA, cPoints[0], status);
    716             tok->CEs[0] |= (caseCE & 0xC0);
    717         }
    718     }
    719 
    720 #if UCOL_DEBUG==2
    721     fprintf(stderr, "%04X str: %i, [%08X, %08X, %08X]: tok: ", tok->debugSource, tok->strength, CEparts[0] >> (32-8*noOfBytes[0]), CEparts[1] >> (32-8*noOfBytes[1]), CEparts[2]>> (32-8*noOfBytes[2]));
    722     for(i = 0; i<tok->noOfCEs; i++) {
    723         fprintf(stderr, "%08X ", tok->CEs[i]);
    724     }
    725     fprintf(stderr, "\n");
    726 #endif
    727 }
    728 
    729 U_CFUNC void ucol_initBuffers(UColTokenParser *src, UColTokListHeader *lh, UErrorCode *status) {
    730     ucolCEGenerator Gens[UCOL_CE_STRENGTH_LIMIT];
    731     uint32_t CEparts[UCOL_CE_STRENGTH_LIMIT];
    732 
    733     UColToken *tok = lh->last;
    734     uint32_t t[UCOL_STRENGTH_LIMIT];
    735 
    736     uprv_memset(t, 0, UCOL_STRENGTH_LIMIT*sizeof(uint32_t));
    737 
    738     tok->toInsert = 1;
    739     t[tok->strength] = 1;
    740 
    741     while(tok->previous != NULL) {
    742         if(tok->previous->strength < tok->strength) { /* going up */
    743             t[tok->strength] = 0;
    744             t[tok->previous->strength]++;
    745         } else if(tok->previous->strength > tok->strength) { /* going down */
    746             t[tok->previous->strength] = 1;
    747         } else {
    748             t[tok->strength]++;
    749         }
    750         tok=tok->previous;
    751         tok->toInsert = t[tok->strength];
    752     }
    753 
    754     tok->toInsert = t[tok->strength];
    755     ucol_inv_getGapPositions(src, lh, status);
    756 
    757 #if UCOL_DEBUG
    758     fprintf(stderr, "BaseCE: %08X %08X\n", lh->baseCE, lh->baseContCE);
    759     int32_t j = 2;
    760     for(j = 2; j >= 0; j--) {
    761         fprintf(stderr, "gapsLo[%i] [%08X %08X %08X]\n", j, lh->gapsLo[j*3], lh->gapsLo[j*3+1], lh->gapsLo[j*3+2]);
    762         fprintf(stderr, "gapsHi[%i] [%08X %08X %08X]\n", j, lh->gapsHi[j*3], lh->gapsHi[j*3+1], lh->gapsHi[j*3+2]);
    763     }
    764     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
    765 
    766     do {
    767         fprintf(stderr,"%i", tok->strength);
    768         tok = tok->next;
    769     } while(tok != NULL);
    770     fprintf(stderr, "\n");
    771 
    772     tok=lh->first[UCOL_TOK_POLARITY_POSITIVE];
    773 
    774     do {
    775         fprintf(stderr,"%i", tok->toInsert);
    776         tok = tok->next;
    777     } while(tok != NULL);
    778 #endif
    779 
    780     tok = lh->first;
    781     uint32_t fStrength = UCOL_IDENTICAL;
    782     uint32_t initStrength = UCOL_IDENTICAL;
    783 
    784 
    785     CEparts[UCOL_PRIMARY] = (lh->baseCE & UCOL_PRIMARYMASK) | (lh->baseContCE & UCOL_PRIMARYMASK) >> 16;
    786     CEparts[UCOL_SECONDARY] = (lh->baseCE & UCOL_SECONDARYMASK) << 16 | (lh->baseContCE & UCOL_SECONDARYMASK) << 8;
    787     CEparts[UCOL_TERTIARY] = (UCOL_TERTIARYORDER(lh->baseCE)) << 24 | (UCOL_TERTIARYORDER(lh->baseContCE)) << 16;
    788 
    789     while (tok != NULL && U_SUCCESS(*status)) {
    790         fStrength = tok->strength;
    791         if(fStrength < initStrength) {
    792             initStrength = fStrength;
    793             if(lh->pos[fStrength] == -1) {
    794                 while(lh->pos[fStrength] == -1 && fStrength > 0) {
    795                     fStrength--;
    796                 }
    797                 if(lh->pos[fStrength] == -1) {
    798                     *status = U_INTERNAL_PROGRAM_ERROR;
    799                     return;
    800                 }
    801             }
    802             if(initStrength == UCOL_TERTIARY) { /* starting with tertiary */
    803                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    804                 CEparts[UCOL_SECONDARY] = lh->gapsLo[fStrength*3+1];
    805                 /*CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[2], lh->gapsLo[fStrength*3+2], lh->gapsHi[fStrength*3+2], tok, UCOL_TERTIARY); */
    806                 CEparts[UCOL_TERTIARY] = ucol_getCEGenerator(&Gens[UCOL_TERTIARY], lh->gapsLo, lh->gapsHi, tok, fStrength, status);
    807             } else if(initStrength == UCOL_SECONDARY) { /* secondaries */
    808                 CEparts[UCOL_PRIMARY] = lh->gapsLo[fStrength*3];
    809                 /*CEparts[1] = ucol_getCEGenerator(&Gens[1], lh->gapsLo[fStrength*3+1], lh->gapsHi[fStrength*3+1], tok, 1);*/
    810                 CEparts[UCOL_SECONDARY] = ucol_getCEGenerator(&Gens[UCOL_SECONDARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    811                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    812             } else { /* primaries */
    813                 /*CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[0], lh->gapsLo[0], lh->gapsHi[0], tok, UCOL_PRIMARY);*/
    814                 CEparts[UCOL_PRIMARY] = ucol_getCEGenerator(&Gens[UCOL_PRIMARY], lh->gapsLo, lh->gapsHi, tok, fStrength,  status);
    815                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    816                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    817             }
    818         } else {
    819             if(tok->strength == UCOL_TERTIARY) {
    820                 CEparts[UCOL_TERTIARY] = ucol_getNextGenerated(&Gens[UCOL_TERTIARY], status);
    821             } else if(tok->strength == UCOL_SECONDARY) {
    822                 CEparts[UCOL_SECONDARY] = ucol_getNextGenerated(&Gens[UCOL_SECONDARY], status);
    823                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    824             } else if(tok->strength == UCOL_PRIMARY) {
    825                 CEparts[UCOL_PRIMARY] = ucol_getNextGenerated(&Gens[UCOL_PRIMARY], status);
    826                 CEparts[UCOL_SECONDARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_SECONDARY], tok, UCOL_SECONDARY, status);
    827                 CEparts[UCOL_TERTIARY] = ucol_getSimpleCEGenerator(&Gens[UCOL_TERTIARY], tok, UCOL_TERTIARY, status);
    828             }
    829         }
    830         ucol_doCE(src, CEparts, tok, status);
    831         tok = tok->next;
    832     }
    833 }
    834 
    835 U_CFUNC void ucol_createElements(UColTokenParser *src, tempUCATable *t, UColTokListHeader *lh, UErrorCode *status) {
    836     UCAElements el;
    837     UColToken *tok = lh->first;
    838     UColToken *expt = NULL;
    839     uint32_t i = 0, j = 0;
    840     UChar32 fcdHighStart;
    841     const uint16_t *fcdTrieIndex = unorm_getFCDTrieIndex(fcdHighStart, status);
    842 
    843     while(tok != NULL && U_SUCCESS(*status)) {
    844         /* first, check if there are any expansions */
    845         /* if there are expansions, we need to do a little bit more processing */
    846         /* since parts of expansion can be tailored, while others are not */
    847         if(tok->expansion != 0) {
    848             uint32_t len = tok->expansion >> 24;
    849             uint32_t currentSequenceLen = len;
    850             uint32_t expOffset = tok->expansion & 0x00FFFFFF;
    851             //uint32_t exp = currentSequenceLen | expOffset;
    852             UColToken exp;
    853             exp.source = currentSequenceLen | expOffset;
    854             exp.rulesToParse = src->source;
    855 
    856             while(len > 0) {
    857                 currentSequenceLen = len;
    858                 while(currentSequenceLen > 0) {
    859                     exp.source = (currentSequenceLen << 24) | expOffset;
    860                     if((expt = (UColToken *)uhash_get(src->tailored, &exp)) != NULL && expt->strength != UCOL_TOK_RESET) { /* expansion is tailored */
    861                         uint32_t noOfCEsToCopy = expt->noOfCEs;
    862                         for(j = 0; j<noOfCEsToCopy; j++) {
    863                             tok->expCEs[tok->noOfExpCEs + j] = expt->CEs[j];
    864                         }
    865                         tok->noOfExpCEs += noOfCEsToCopy;
    866                         // Smart people never try to add codepoints and CEs.
    867                         // For some odd reason, it won't work.
    868                         expOffset += currentSequenceLen; //noOfCEsToCopy;
    869                         len -= currentSequenceLen; //noOfCEsToCopy;
    870                         break;
    871                     } else {
    872                         currentSequenceLen--;
    873                     }
    874                 }
    875                 if(currentSequenceLen == 0) { /* couldn't find any tailored subsequence */
    876                     /* will have to get one from UCA */
    877                     /* first, get the UChars from the rules */
    878                     /* then pick CEs out until there is no more and stuff them into expansion */
    879                     collIterate s;
    880                     uint32_t order = 0;
    881                     uprv_init_collIterate(src->UCA, expOffset + src->source, 1, &s);
    882 
    883                     for(;;) {
    884                         order = ucol_getNextCE(src->UCA, &s, status);
    885                         if(order == UCOL_NO_MORE_CES) {
    886                             break;
    887                         }
    888                         tok->expCEs[tok->noOfExpCEs++] = order;
    889                     }
    890                     expOffset++;
    891                     len--;
    892                 }
    893             }
    894         } else {
    895             tok->noOfExpCEs = 0;
    896         }
    897 
    898         /* set the ucaelement with obtained values */
    899         el.noOfCEs = tok->noOfCEs + tok->noOfExpCEs;
    900         /* copy CEs */
    901         for(i = 0; i<tok->noOfCEs; i++) {
    902             el.CEs[i] = tok->CEs[i];
    903         }
    904         for(i = 0; i<tok->noOfExpCEs; i++) {
    905             el.CEs[i+tok->noOfCEs] = tok->expCEs[i];
    906         }
    907 
    908         /* copy UChars */
    909         // We kept prefix and source kind of together, as it is a kind of a contraction.
    910         // However, now we have to slice the prefix off the main thing -
    911         el.prefix = el.prefixChars;
    912         el.cPoints = el.uchars;
    913         if(tok->prefix != 0) { // we will just copy the prefix here, and adjust accordingly in the
    914             // addPrefix function in ucol_elm. The reason is that we need to add both composed AND
    915             // decomposed elements to the unsaf table.
    916             el.prefixSize = tok->prefix>>24;
    917             uprv_memcpy(el.prefix, src->source + (tok->prefix & 0x00FFFFFF), el.prefixSize*sizeof(UChar));
    918 
    919             el.cSize = (tok->source >> 24)-(tok->prefix>>24);
    920             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF)+(tok->prefix>>24) + src->source, el.cSize*sizeof(UChar));
    921         } else {
    922             el.prefixSize = 0;
    923             *el.prefix = 0;
    924 
    925             el.cSize = (tok->source >> 24);
    926             uprv_memcpy(el.uchars, (tok->source & 0x00FFFFFF) + src->source, el.cSize*sizeof(UChar));
    927         }
    928         if(src->UCA != NULL) {
    929             for(i = 0; i<el.cSize; i++) {
    930                 if(UCOL_ISJAMO(el.cPoints[i])) {
    931                     t->image->jamoSpecial = TRUE;
    932                 }
    933             }
    934             if (!src->buildCCTabFlag && el.cSize > 0) {
    935                 // Check the trailing canonical combining class (tccc) of the last character.
    936                 const UChar *s = el.cPoints + el.cSize;
    937                 uint16_t fcd = unorm_prevFCD16(fcdTrieIndex, fcdHighStart, el.cPoints, s);
    938                 if ((fcd & 0xff) != 0) {
    939                     src->buildCCTabFlag = TRUE;
    940                 }
    941             }
    942         }
    943 
    944         /* and then, add it */
    945 #if UCOL_DEBUG==2
    946         fprintf(stderr, "Adding: %04X with %08X\n", el.cPoints[0], el.CEs[0]);
    947 #endif
    948         uprv_uca_addAnElement(t, &el, status);
    949 
    950 #if UCOL_DEBUG_DUPLICATES
    951         if(*status != U_ZERO_ERROR) {
    952             fprintf(stderr, "replaced CE for %04X with CE for %04X\n", el.cPoints[0], tok->debugSource);
    953             *status = U_ZERO_ERROR;
    954         }
    955 #endif
    956 
    957         tok = tok->next;
    958     }
    959 }
    960 
    961 U_CDECL_BEGIN
    962 static UBool U_CALLCONV
    963 _processUCACompleteIgnorables(const void *context, UChar32 start, UChar32 limit, uint32_t value) {
    964     UErrorCode status = U_ZERO_ERROR;
    965     tempUCATable *t = (tempUCATable *)context;
    966     if(value == 0) {
    967         while(start < limit) {
    968             uint32_t CE = utrie_get32(t->mapping, start, NULL);
    969             if(CE == UCOL_NOT_FOUND) {
    970                 UCAElements el;
    971                 el.isThai = FALSE;
    972                 el.prefixSize = 0;
    973                 el.prefixChars[0] = 0;
    974                 el.prefix = el.prefixChars;
    975                 el.cPoints = el.uchars;
    976 
    977                 el.cSize = 0;
    978                 UTF_APPEND_CHAR(el.uchars, el.cSize, 1024, start);
    979 
    980                 el.noOfCEs = 1;
    981                 el.CEs[0] = 0;
    982                 uprv_uca_addAnElement(t, &el, &status);
    983 
    984             }
    985             start++;
    986         }
    987     }
    988     if(U_FAILURE(status)) {
    989         return FALSE;
    990     } else {
    991         return TRUE;
    992     }
    993 }
    994 U_CDECL_END
    995 
    996 static void
    997 ucol_uprv_bld_copyRangeFromUCA(UColTokenParser *src, tempUCATable *t,
    998                                UChar32 start, UChar32 end,
    999                                UErrorCode *status)
   1000 {
   1001     //UChar decomp[256];
   1002     uint32_t CE = UCOL_NOT_FOUND;
   1003     UChar32 u = 0;
   1004     UCAElements el;
   1005     el.isThai = FALSE;
   1006     el.prefixSize = 0;
   1007     el.prefixChars[0] = 0;
   1008     collIterate colIt;
   1009 
   1010     if(U_SUCCESS(*status)) {
   1011         for(u = start; u<=end; u++) {
   1012             if((CE = utrie_get32(t->mapping, u, NULL)) == UCOL_NOT_FOUND
   1013                 /* this test is for contractions that are missing the starting element. */
   1014                 || ((isCntTableElement(CE)) &&
   1015                 (uprv_cnttab_getCE(t->contractions, CE, 0, status) == UCOL_NOT_FOUND))
   1016                 )
   1017             {
   1018                 el.cSize = 0;
   1019                 U16_APPEND_UNSAFE(el.uchars, el.cSize, u);
   1020                 //decomp[0] = (UChar)u;
   1021                 //el.uchars[0] = (UChar)u;
   1022                 el.cPoints = el.uchars;
   1023                 //el.cSize = 1;
   1024                 el.noOfCEs = 0;
   1025                 el.prefix = el.prefixChars;
   1026                 el.prefixSize = 0;
   1027                 //uprv_init_collIterate(src->UCA, decomp, 1, &colIt);
   1028                 // We actually want to check whether this element is a special
   1029                 // If it is an implicit element (hangul, CJK - we want to copy the
   1030                 // special, not the resolved CEs) - for hangul, copying resolved
   1031                 // would just make things the same (there is an expansion and it
   1032                 // takes approximately the same amount of time to resolve as
   1033                 // falling back to the UCA).
   1034                 /*
   1035                 UTRIE_GET32(src->UCA->mapping, u, CE);
   1036                 tag = getCETag(CE);
   1037                 if(tag == HANGUL_SYLLABLE_TAG || tag == CJK_IMPLICIT_TAG
   1038                 || tag == IMPLICIT_TAG || tag == TRAIL_SURROGATE_TAG
   1039                 || tag == LEAD_SURROGATE_TAG) {
   1040                 el.CEs[el.noOfCEs++] = CE;
   1041                 } else {
   1042                 */
   1043                 // It turns out that it does not make sense to keep implicits
   1044                 // unresolved. The cost of resolving them is big enough so that
   1045                 // it doesn't make any difference whether we have to go to the UCA
   1046                 // or not.
   1047                 {
   1048                     uprv_init_collIterate(src->UCA, el.uchars, el.cSize, &colIt);
   1049                     while(CE != UCOL_NO_MORE_CES) {
   1050                         CE = ucol_getNextCE(src->UCA, &colIt, status);
   1051                         if(CE != UCOL_NO_MORE_CES) {
   1052                             el.CEs[el.noOfCEs++] = CE;
   1053                         }
   1054                     }
   1055                 }
   1056                 uprv_uca_addAnElement(t, &el, status);
   1057             }
   1058         }
   1059     }
   1060 }
   1061 
   1062 UCATableHeader *ucol_assembleTailoringTable(UColTokenParser *src, UErrorCode *status) {
   1063     U_NAMESPACE_USE
   1064 
   1065     uint32_t i = 0;
   1066     if(U_FAILURE(*status)) {
   1067         return NULL;
   1068     }
   1069     /*
   1070     2.  Eliminate the negative lists by doing the following for each non-null negative list:
   1071     o   if previousCE(baseCE, strongestN) != some ListHeader X's baseCE,
   1072     create new ListHeader X
   1073     o   reverse the list, add to the end of X's positive list. Reset the strength of the
   1074     first item you add, based on the stronger strength levels of the two lists.
   1075     */
   1076     /*
   1077     3.  For each ListHeader with a non-null positive list:
   1078     */
   1079     /*
   1080     o   Find all character strings with CEs between the baseCE and the
   1081     next/previous CE, at the strength of the first token. Add these to the
   1082     tailoring.
   1083     ? That is, if UCA has ...  x <<< X << x' <<< X' < y ..., and the
   1084     tailoring has & x < z...
   1085     ? Then we change the tailoring to & x  <<< X << x' <<< X' < z ...
   1086     */
   1087     /* It is possible that this part should be done even while constructing list */
   1088     /* The problem is that it is unknown what is going to be the strongest weight */
   1089     /* So we might as well do it here */
   1090 
   1091     /*
   1092     o   Allocate CEs for each token in the list, based on the total number N of the
   1093     largest level difference, and the gap G between baseCE and nextCE at that
   1094     level. The relation * between the last item and nextCE is the same as the
   1095     strongest strength.
   1096     o   Example: baseCE < a << b <<< q << c < d < e * nextCE(X,1)
   1097     ? There are 3 primary items: a, d, e. Fit them into the primary gap.
   1098     Then fit b and c into the secondary gap between a and d, then fit q
   1099     into the tertiary gap between b and c.
   1100 
   1101     o   Example: baseCE << b <<< q << c * nextCE(X,2)
   1102     ? There are 2 secondary items: b, c. Fit them into the secondary gap.
   1103     Then fit q into the tertiary gap between b and c.
   1104     o   When incrementing primary values, we will not cross high byte
   1105     boundaries except where there is only a single-byte primary. That is to
   1106     ensure that the script reordering will continue to work.
   1107     */
   1108     UCATableHeader *image = (UCATableHeader *)uprv_malloc(sizeof(UCATableHeader));
   1109     /* test for NULL */
   1110     if (image == NULL) {
   1111         *status = U_MEMORY_ALLOCATION_ERROR;
   1112         return NULL;
   1113     }
   1114     uprv_memcpy(image, src->UCA->image, sizeof(UCATableHeader));
   1115 
   1116     for(i = 0; i<src->resultLen; i++) {
   1117         /* now we need to generate the CEs */
   1118         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1119         /* According to strength                                                          */
   1120         if(U_SUCCESS(*status)) {
   1121             if(src->lh[i].first) { // if there are any elements
   1122                 // due to the way parser works, subsequent tailorings
   1123                 // may remove all the elements from a sequence, therefore
   1124                 // leaving an empty tailoring sequence.
   1125                 ucol_initBuffers(src, &src->lh[i], status);
   1126             }
   1127         }
   1128         if(U_FAILURE(*status)) {
   1129             uprv_free(image);
   1130             return NULL;
   1131         }
   1132     }
   1133 
   1134     if(src->varTop != NULL) { /* stuff the variable top value */
   1135         src->opts->variableTopValue = (*(src->varTop->CEs))>>16;
   1136         /* remove it from the list */
   1137         if(src->varTop->listHeader->first == src->varTop) { /* first in list */
   1138             src->varTop->listHeader->first = src->varTop->next;
   1139         }
   1140         if(src->varTop->listHeader->last == src->varTop) { /* first in list */
   1141             src->varTop->listHeader->last = src->varTop->previous;
   1142         }
   1143         if(src->varTop->next != NULL) {
   1144             src->varTop->next->previous = src->varTop->previous;
   1145         }
   1146         if(src->varTop->previous != NULL) {
   1147             src->varTop->previous->next = src->varTop->next;
   1148         }
   1149     }
   1150 
   1151 
   1152     tempUCATable *t = uprv_uca_initTempTable(image, src->opts, src->UCA, NOT_FOUND_TAG, NOT_FOUND_TAG, status);
   1153     if(U_FAILURE(*status)) {
   1154         uprv_free(image);
   1155         return NULL;
   1156     }
   1157 
   1158 
   1159     /* After this, we have assigned CE values to all regular CEs      */
   1160     /* now we will go through list once more and resolve expansions,  */
   1161     /* make UCAElements structs and add them to table                 */
   1162     for(i = 0; i<src->resultLen; i++) {
   1163         /* now we need to generate the CEs */
   1164         /* We stuff the initial value in the buffers, and increase the appropriate buffer */
   1165         /* According to strength                                                          */
   1166         if(U_SUCCESS(*status)) {
   1167             ucol_createElements(src, t, &src->lh[i], status);
   1168         }
   1169     }
   1170 
   1171     UCAElements el;
   1172     el.isThai = FALSE;
   1173     el.prefixSize = 0;
   1174     el.prefixChars[0] = 0;
   1175 
   1176     /* add latin-1 stuff */
   1177     ucol_uprv_bld_copyRangeFromUCA(src, t, 0, 0xFF, status);
   1178 
   1179     /* add stuff for copying */
   1180     if(src->copySet != NULL) {
   1181         int32_t i = 0;
   1182         UnicodeSet *set = (UnicodeSet *)src->copySet;
   1183         for(i = 0; i < set->getRangeCount(); i++) {
   1184             ucol_uprv_bld_copyRangeFromUCA(src, t, set->getRangeStart(i), set->getRangeEnd(i), status);
   1185         }
   1186     }
   1187 
   1188     if(U_SUCCESS(*status)) {
   1189         /* copy contractions from the UCA - this is felt mostly for cyrillic*/
   1190 
   1191         uint32_t tailoredCE = UCOL_NOT_FOUND;
   1192         //UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->UCAConsts+sizeof(UCAConstants));
   1193         UChar *conts = (UChar *)((uint8_t *)src->UCA->image + src->UCA->image->contractionUCACombos);
   1194         UCollationElements *ucaEl = ucol_openElements(src->UCA, NULL, 0, status);
   1195         // Check for null pointer
   1196         if (ucaEl == NULL) {
   1197         	*status = U_MEMORY_ALLOCATION_ERROR;
   1198         	return NULL;
   1199         }
   1200         while(*conts != 0) {
   1201             /*tailoredCE = ucmpe32_get(t->mapping, *conts);*/
   1202             tailoredCE = utrie_get32(t->mapping, *conts, NULL);
   1203             if(tailoredCE != UCOL_NOT_FOUND) {
   1204                 UBool needToAdd = TRUE;
   1205                 if(isCntTableElement(tailoredCE)) {
   1206                     if(uprv_cnttab_isTailored(t->contractions, tailoredCE, conts+1, status) == TRUE) {
   1207                         needToAdd = FALSE;
   1208                     }
   1209                 }
   1210                 if (!needToAdd && isPrefix(tailoredCE) && *(conts+1)==0) {
   1211                     UCAElements elm;
   1212                     elm.cPoints = el.uchars;
   1213                     elm.noOfCEs = 0;
   1214                     elm.uchars[0] = *conts;
   1215                     elm.uchars[1] = 0;
   1216                     elm.cSize = 1;
   1217                     elm.prefixChars[0] = *(conts+2);
   1218                     elm.isThai = FALSE;
   1219                     elm.prefix = elm.prefixChars;
   1220                     elm.prefixSize = 1;
   1221                     UCAElements *prefixEnt=(UCAElements *)uhash_get(t->prefixLookup, &elm);
   1222                     if ((prefixEnt==NULL) || *(prefixEnt->prefix)!=*(conts+2)) {
   1223                         needToAdd = TRUE;
   1224                     }
   1225                 }
   1226                 if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1227                     needToAdd = FALSE;
   1228                 }
   1229 
   1230                 if(needToAdd == TRUE) { // we need to add if this contraction is not tailored.
   1231                     if (*(conts+1) != 0) {  // contractions
   1232                         el.prefix = el.prefixChars;
   1233                         el.prefixSize = 0;
   1234                         el.cPoints = el.uchars;
   1235                         el.noOfCEs = 0;
   1236                         el.uchars[0] = *conts;
   1237                         el.uchars[1] = *(conts+1);
   1238                         if(*(conts+2)!=0) {
   1239                             el.uchars[2] = *(conts+2);
   1240                             el.cSize = 3;
   1241                         } else {
   1242                             el.cSize = 2;
   1243                         }
   1244                         ucol_setText(ucaEl, el.uchars, el.cSize, status);
   1245                     }
   1246                     else { // pre-context character
   1247                         UChar str[4] = { 0 };
   1248                         int32_t len=0;
   1249                         int32_t preKeyLen=0;
   1250 
   1251                         el.cPoints = el.uchars;
   1252                         el.noOfCEs = 0;
   1253                         el.uchars[0] = *conts;
   1254                         el.uchars[1] = 0;
   1255                         el.cSize = 1;
   1256                         el.prefixChars[0] = *(conts+2);
   1257                         el.prefix = el.prefixChars;
   1258                         el.prefixSize = 1;
   1259                         if (el.prefixChars[0]!=0) {
   1260                             // get CE of prefix character first
   1261                             str[0]=el.prefixChars[0];
   1262                             str[1]=0;
   1263                             ucol_setText(ucaEl, str, 1, status);
   1264                             while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status))
   1265                                     != UCOL_NULLORDER) {
   1266                                 preKeyLen++;  // count number of keys for prefix character
   1267                             }
   1268                             str[len++] = el.prefixChars[0];
   1269                         }
   1270 
   1271                         str[len++] = el.uchars[0];
   1272                         str[len]=0;
   1273                         ucol_setText(ucaEl, str, len, status);
   1274                         // Skip the keys for prefix character, then copy the rest to el.
   1275                         while ((preKeyLen-->0) &&
   1276                                (int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1277                             continue;
   1278                         }
   1279 
   1280                     }
   1281                     while ((int32_t)(el.CEs[el.noOfCEs] = ucol_next(ucaEl, status)) != UCOL_NULLORDER) {
   1282                         el.noOfCEs++;
   1283                     }
   1284                     uprv_uca_addAnElement(t, &el, status);
   1285                 }
   1286 
   1287             } else if(src->removeSet != NULL && uset_contains(src->removeSet, *conts)) {
   1288                 ucol_uprv_bld_copyRangeFromUCA(src, t, *conts, *conts, status);
   1289             }
   1290             conts+=3;
   1291         }
   1292         ucol_closeElements(ucaEl);
   1293     }
   1294 
   1295     // Add completely ignorable elements
   1296     utrie_enum(&t->UCA->mapping, NULL, _processUCACompleteIgnorables, t);
   1297 
   1298     // add tailoring characters related canonical closures
   1299     uprv_uca_canonicalClosure(t, src, status);
   1300 
   1301     /* still need to produce compatibility closure */
   1302 
   1303     UCATableHeader *myData = uprv_uca_assembleTable(t, status);
   1304 
   1305     uprv_uca_closeTempTable(t);
   1306     uprv_free(image);
   1307 
   1308     return myData;
   1309 }
   1310 
   1311 U_CDECL_BEGIN
   1312 static UBool U_CALLCONV
   1313 ucol_bld_cleanup(void)
   1314 {
   1315     udata_close(invUCA_DATA_MEM);
   1316     invUCA_DATA_MEM = NULL;
   1317     _staticInvUCA = NULL;
   1318     return TRUE;
   1319 }
   1320 U_CDECL_END
   1321 
   1322 U_CAPI const InverseUCATableHeader * U_EXPORT2
   1323 ucol_initInverseUCA(UErrorCode *status)
   1324 {
   1325     if(U_FAILURE(*status)) return NULL;
   1326 
   1327     UBool needsInit;
   1328     UMTX_CHECK(NULL, (_staticInvUCA == NULL), needsInit);
   1329 
   1330     if(needsInit) {
   1331         InverseUCATableHeader *newInvUCA = NULL;
   1332         UDataMemory *result = udata_openChoice(U_ICUDATA_COLL, INVC_DATA_TYPE, INVC_DATA_NAME, isAcceptableInvUCA, NULL, status);
   1333 
   1334         if(U_FAILURE(*status)) {
   1335             if (result) {
   1336                 udata_close(result);
   1337             }
   1338             // This is not needed, as we are talking about
   1339             // memory we got from UData
   1340             //uprv_free(newInvUCA);
   1341         }
   1342 
   1343         if(result != NULL) { /* It looks like sometimes we can fail to find the data file */
   1344             newInvUCA = (InverseUCATableHeader *)udata_getMemory(result);
   1345             UCollator *UCA = ucol_initUCA(status);
   1346             // UCA versions of UCA and inverse UCA should match
   1347             if(uprv_memcmp(newInvUCA->UCAVersion, UCA->image->UCAVersion, sizeof(UVersionInfo)) != 0) {
   1348                 *status = U_INVALID_FORMAT_ERROR;
   1349                 udata_close(result);
   1350                 return NULL;
   1351             }
   1352 
   1353             umtx_lock(NULL);
   1354             if(_staticInvUCA == NULL) {
   1355                 invUCA_DATA_MEM = result;
   1356                 _staticInvUCA = newInvUCA;
   1357                 result = NULL;
   1358                 newInvUCA = NULL;
   1359             }
   1360             umtx_unlock(NULL);
   1361 
   1362             if(newInvUCA != NULL) {
   1363                 udata_close(result);
   1364                 // This is not needed, as we are talking about
   1365                 // memory we got from UData
   1366                 //uprv_free(newInvUCA);
   1367             }
   1368             else {
   1369                 ucln_i18n_registerCleanup(UCLN_I18N_UCOL_BLD, ucol_bld_cleanup);
   1370             }
   1371         }
   1372     }
   1373     return _staticInvUCA;
   1374 }
   1375 
   1376 #endif /* #if !UCONFIG_NO_COLLATION */
   1377