Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2010, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_wgt.c
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2001mar08
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file contains code for allocating n collation element weights
     17 *   between two exclusive limits.
     18 *   It is used only internally by ucol_bld.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "ucol_imp.h"
     26 #include "ucol_wgt.h"
     27 #include "cmemory.h"
     28 #include "uarrsort.h"
     29 
     30 #ifdef UCOL_DEBUG
     31 #   include <stdio.h>
     32 #endif
     33 
     34 /* collation element weight allocation -------------------------------------- */
     35 
     36 /* helper functions for CE weights */
     37 
     38 static U_INLINE int32_t
     39 lengthOfWeight(uint32_t weight) {
     40     if((weight&0xffffff)==0) {
     41         return 1;
     42     } else if((weight&0xffff)==0) {
     43         return 2;
     44     } else if((weight&0xff)==0) {
     45         return 3;
     46     } else {
     47         return 4;
     48     }
     49 }
     50 
     51 static U_INLINE uint32_t
     52 getWeightTrail(uint32_t weight, int32_t length) {
     53     return (uint32_t)(weight>>(8*(4-length)))&0xff;
     54 }
     55 
     56 static U_INLINE uint32_t
     57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
     58     length=8*(4-length);
     59     return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
     60 }
     61 
     62 static U_INLINE uint32_t
     63 getWeightByte(uint32_t weight, int32_t idx) {
     64     return getWeightTrail(weight, idx); /* same calculation */
     65 }
     66 
     67 static U_INLINE uint32_t
     68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
     69     uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
     70 
     71     idx*=8;
     72     mask=((uint32_t)0xffffffff)>>idx;
     73     idx=32-idx;
     74     mask|=0xffffff00<<idx;
     75     return (uint32_t)((weight&mask)|(byte<<idx));
     76 }
     77 
     78 static U_INLINE uint32_t
     79 truncateWeight(uint32_t weight, int32_t length) {
     80     return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
     81 }
     82 
     83 static U_INLINE uint32_t
     84 incWeightTrail(uint32_t weight, int32_t length) {
     85     return (uint32_t)(weight+(1UL<<(8*(4-length))));
     86 }
     87 
     88 static U_INLINE uint32_t
     89 decWeightTrail(uint32_t weight, int32_t length) {
     90     return (uint32_t)(weight-(1UL<<(8*(4-length))));
     91 }
     92 
     93 static U_INLINE uint32_t
     94 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
     95     uint32_t byte;
     96 
     97     for(;;) {
     98         byte=getWeightByte(weight, length);
     99         if(byte<maxByte) {
    100             return setWeightByte(weight, length, byte+1);
    101         } else {
    102             /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
    103             weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
    104             --length;
    105         }
    106     }
    107 }
    108 
    109 static U_INLINE int32_t
    110 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
    111     int32_t length;
    112 
    113     length=range->length2+1;
    114     range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
    115     range->end=setWeightTrail(range->end, length, maxByte);
    116     range->count2*=countBytes;
    117     range->length2=length;
    118     return length;
    119 }
    120 
    121 /* for uprv_sortArray: sort ranges in weight order */
    122 static int32_t U_CALLCONV
    123 compareRanges(const void * /*context*/, const void *left, const void *right) {
    124     uint32_t l, r;
    125 
    126     l=((const WeightRange *)left)->start;
    127     r=((const WeightRange *)right)->start;
    128     if(l<r) {
    129         return -1;
    130     } else if(l>r) {
    131         return 1;
    132     } else {
    133         return 0;
    134     }
    135 }
    136 
    137 /*
    138  * take two CE weights and calculate the
    139  * possible ranges of weights between the two limits, excluding them
    140  * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
    141  */
    142 static U_INLINE int32_t
    143 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
    144                 uint32_t maxByte, uint32_t countBytes,
    145                 WeightRange ranges[7]) {
    146     WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    147     uint32_t weight, trail;
    148     int32_t length, lowerLength, upperLength, rangeCount;
    149 
    150     /* assume that both lowerLimit & upperLimit are not 0 */
    151 
    152     /* get the lengths of the limits */
    153     lowerLength=lengthOfWeight(lowerLimit);
    154     upperLength=lengthOfWeight(upperLimit);
    155 
    156 #ifdef UCOL_DEBUG
    157     printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
    158     printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
    159 #endif
    160 
    161     if(lowerLimit>=upperLimit) {
    162 #ifdef UCOL_DEBUG
    163         printf("error: no space between lower & upper limits\n");
    164 #endif
    165         return 0;
    166     }
    167 
    168     /* check that neither is a prefix of the other */
    169     if(lowerLength<upperLength) {
    170         if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
    171 #ifdef UCOL_DEBUG
    172             printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
    173 #endif
    174             return 0;
    175         }
    176     }
    177     /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
    178 
    179     /* reset local variables */
    180     uprv_memset(lower, 0, sizeof(lower));
    181     uprv_memset(&middle, 0, sizeof(middle));
    182     uprv_memset(upper, 0, sizeof(upper));
    183 
    184     /*
    185      * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
    186      * range     minimum length
    187      * lower[4]  4
    188      * lower[3]  3
    189      * lower[2]  2
    190      * middle    1
    191      * upper[2]  2
    192      * upper[3]  3
    193      * upper[4]  4
    194      *
    195      * We are now going to calculate up to 7 ranges.
    196      * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
    197      */
    198     weight=lowerLimit;
    199     for(length=lowerLength; length>=2; --length) {
    200         trail=getWeightTrail(weight, length);
    201         if(trail<maxByte) {
    202             lower[length].start=incWeightTrail(weight, length);
    203             lower[length].end=setWeightTrail(weight, length, maxByte);
    204             lower[length].length=length;
    205             lower[length].count=maxByte-trail;
    206         }
    207         weight=truncateWeight(weight, length-1);
    208     }
    209     middle.start=incWeightTrail(weight, 1);
    210 
    211     weight=upperLimit;
    212     for(length=upperLength; length>=2; --length) {
    213         trail=getWeightTrail(weight, length);
    214         if(trail>UCOL_BYTE_FIRST_TAILORED) {
    215             upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
    216             upper[length].end=decWeightTrail(weight, length);
    217             upper[length].length=length;
    218             upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
    219         }
    220         weight=truncateWeight(weight, length-1);
    221     }
    222     middle.end=decWeightTrail(weight, 1);
    223 
    224     /* set the middle range */
    225     middle.length=1;
    226     if(middle.end>=middle.start) {
    227         middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
    228     } else {
    229         /* eliminate overlaps */
    230         uint32_t start, end;
    231 
    232         /* remove the middle range */
    233         middle.count=0;
    234 
    235         /* reduce or remove the lower ranges that go beyond upperLimit */
    236         for(length=4; length>=2; --length) {
    237             if(lower[length].count>0 && upper[length].count>0) {
    238                 start=upper[length].start;
    239                 end=lower[length].end;
    240 
    241                 if(end>=start || incWeight(end, length, maxByte)==start) {
    242                     /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
    243                     start=lower[length].start;
    244                     end=lower[length].end=upper[length].end;
    245                     /*
    246                      * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
    247                      * it may result in a range with count>countBytes
    248                      */
    249                     lower[length].count=
    250                         (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
    251                                   countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
    252                     upper[length].count=0;
    253                     while(--length>=2) {
    254                         lower[length].count=upper[length].count=0;
    255                     }
    256                     break;
    257                 }
    258             }
    259         }
    260     }
    261 
    262 #ifdef UCOL_DEBUG
    263     /* print ranges */
    264     for(length=4; length>=2; --length) {
    265         if(lower[length].count>0) {
    266             printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
    267         }
    268     }
    269     if(middle.count>0) {
    270         printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
    271     }
    272     for(length=2; length<=4; ++length) {
    273         if(upper[length].count>0) {
    274             printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
    275         }
    276     }
    277 #endif
    278 
    279     /* copy the ranges, shortest first, into the result array */
    280     rangeCount=0;
    281     if(middle.count>0) {
    282         uprv_memcpy(ranges, &middle, sizeof(WeightRange));
    283         rangeCount=1;
    284     }
    285     for(length=2; length<=4; ++length) {
    286         /* copy upper first so that later the middle range is more likely the first one to use */
    287         if(upper[length].count>0) {
    288             uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
    289             ++rangeCount;
    290         }
    291         if(lower[length].count>0) {
    292             uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
    293             ++rangeCount;
    294         }
    295     }
    296     return rangeCount;
    297 }
    298 
    299 /*
    300  * call getWeightRanges and then determine heuristically
    301  * which ranges to use for a given number of weights between (excluding)
    302  * two limits
    303  */
    304 U_CFUNC int32_t
    305 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
    306                   uint32_t n,
    307                   uint32_t maxByte,
    308                   WeightRange ranges[7]) {
    309     /* number of usable byte values 3..maxByte */
    310     uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
    311 
    312     uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
    313     uint32_t maxCount;
    314     int32_t i, rangeCount, minLength/*, maxLength*/;
    315 
    316     /* countBytes to the power of index */
    317     uint32_t powers[5];
    318     /* gcc requires explicit initialization */
    319     powers[0] = 1;
    320     powers[1] = countBytes;
    321     powers[2] = countBytes*countBytes;
    322     powers[3] = countBytes*countBytes*countBytes;
    323     powers[4] = countBytes*countBytes*countBytes*countBytes;
    324 
    325 #ifdef UCOL_DEBUG
    326     puts("");
    327 #endif
    328 
    329     rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
    330     if(rangeCount<=0) {
    331 #ifdef UCOL_DEBUG
    332         printf("error: unable to get Weight ranges\n");
    333 #endif
    334         return 0;
    335     }
    336 
    337     /* what is the maximum number of weights with these ranges? */
    338     maxCount=0;
    339     for(i=0; i<rangeCount; ++i) {
    340         maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
    341     }
    342     if(maxCount>=n) {
    343 #ifdef UCOL_DEBUG
    344         printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
    345 #endif
    346     } else {
    347 #ifdef UCOL_DEBUG
    348         printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
    349 #endif
    350         return 0;
    351     }
    352 
    353     /* set the length2 and count2 fields */
    354     for(i=0; i<rangeCount; ++i) {
    355         ranges[i].length2=ranges[i].length;
    356         ranges[i].count2=(uint32_t)ranges[i].count;
    357     }
    358 
    359     /* try until we find suitably large ranges */
    360     for(;;) {
    361         /* get the smallest number of bytes in a range */
    362         minLength=ranges[0].length2;
    363 
    364         /* sum up the number of elements that fit into ranges of each byte length */
    365         uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
    366         for(i=0; i<rangeCount; ++i) {
    367             lengthCounts[ranges[i].length2]+=ranges[i].count2;
    368         }
    369 
    370         /* now try to allocate n elements in the available short ranges */
    371         if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
    372             /* trivial cases, use the first few ranges */
    373             maxCount=0;
    374             rangeCount=0;
    375             do {
    376                 maxCount+=ranges[rangeCount].count2;
    377                 ++rangeCount;
    378             } while(n>maxCount);
    379 #ifdef UCOL_DEBUG
    380             printf("take first %ld ranges\n", rangeCount);
    381 #endif
    382             break;
    383         } else if(n<=ranges[0].count2*countBytes) {
    384             /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
    385             uint32_t count1, count2, power_1, power;
    386 
    387             /*maxLength=minLength+1;*/
    388 
    389             /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
    390             power_1=powers[minLength-ranges[0].length];
    391             power=power_1*countBytes;
    392             count2=(n+power-1)/power;
    393             count1=ranges[0].count-count2;
    394 
    395             /* split the range */
    396 #ifdef UCOL_DEBUG
    397             printf("split the first range %ld:%ld\n", count1, count2);
    398 #endif
    399             if(count1<1) {
    400                 rangeCount=1;
    401 
    402                 /* lengthen the entire range to maxLength */
    403                 lengthenRange(ranges, maxByte, countBytes);
    404             } else {
    405                 /* really split the range */
    406                 uint32_t byte;
    407 
    408                 /* create a new range with the end and initial and current length of the old one */
    409                 rangeCount=2;
    410                 ranges[1].end=ranges[0].end;
    411                 ranges[1].length=ranges[0].length;
    412                 ranges[1].length2=minLength;
    413 
    414                 /* set the end of the first range according to count1 */
    415                 i=ranges[0].length;
    416                 byte=getWeightByte(ranges[0].start, i)+count1-1;
    417 
    418                 /*
    419                  * ranges[0].count and count1 may be >countBytes
    420                  * from merging adjacent ranges;
    421                  * byte>maxByte is possible
    422                  */
    423                 if(byte<=maxByte) {
    424                     ranges[0].end=setWeightByte(ranges[0].start, i, byte);
    425                 } else /* byte>maxByte */ {
    426                     ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
    427                 }
    428 
    429                 /* set the bytes in the end weight at length+1..length2 to maxByte */
    430                 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
    431                 ranges[0].end=truncateWeight(ranges[0].end, i)|
    432                               ((byte>>(8*i))&(byte<<(8*(4-minLength))));
    433 
    434                 /* set the start of the second range to immediately follow the end of the first one */
    435                 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
    436 
    437                 /* set the count values (informational) */
    438                 ranges[0].count=count1;
    439                 ranges[1].count=count2;
    440 
    441                 ranges[0].count2=count1*power_1;
    442                 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
    443 
    444                 /* lengthen the second range to maxLength */
    445                 lengthenRange(ranges+1, maxByte, countBytes);
    446             }
    447             break;
    448         }
    449 
    450         /* no good match, lengthen all minLength ranges and iterate */
    451 #ifdef UCOL_DEBUG
    452         printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
    453 #endif
    454         for(i=0; ranges[i].length2==minLength; ++i) {
    455             lengthenRange(ranges+i, maxByte, countBytes);
    456         }
    457     }
    458 
    459     if(rangeCount>1) {
    460         /* sort the ranges by weight values */
    461         UErrorCode errorCode=U_ZERO_ERROR;
    462         uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
    463         /* ignore error code: we know that the internal sort function will not fail here */
    464     }
    465 
    466 #ifdef UCOL_DEBUG
    467     puts("final ranges:");
    468     for(i=0; i<rangeCount; ++i) {
    469         printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
    470                i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
    471     }
    472 #endif
    473 
    474     /* set maxByte in ranges[0] for ucol_nextWeight() */
    475     ranges[0].count=maxByte;
    476 
    477     return rangeCount;
    478 }
    479 
    480 /*
    481  * given a set of ranges calculated by ucol_allocWeights(),
    482  * iterate through the weights
    483  */
    484 U_CFUNC uint32_t
    485 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
    486     if(*pRangeCount<=0) {
    487         return 0xffffffff;
    488     } else {
    489         uint32_t weight, maxByte;
    490 
    491         /* get maxByte from the .count field */
    492         maxByte=ranges[0].count;
    493 
    494         /* get the next weight */
    495         weight=ranges[0].start;
    496         if(weight==ranges[0].end) {
    497             /* this range is finished, remove it and move the following ones up */
    498             if(--*pRangeCount>0) {
    499                 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
    500                 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
    501             }
    502         } else {
    503             /* increment the weight for the next value */
    504             ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
    505         }
    506 
    507         return weight;
    508     }
    509 }
    510 
    511 #if 0 // #ifdef UCOL_DEBUG
    512 
    513 static void
    514 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
    515     WeightRange ranges[8];
    516     int32_t rangeCount;
    517 
    518     rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
    519     if(enumerate) {
    520         uint32_t weight;
    521 
    522         while(n>0) {
    523             weight=ucol_nextWeight(ranges, &rangeCount);
    524             if(weight==0xffffffff) {
    525                 printf("error: 0xffffffff with %lu more weights to go\n", n);
    526                 break;
    527             }
    528             printf("    0x%08lx\n", weight);
    529             --n;
    530         }
    531     }
    532 }
    533 
    534 extern int
    535 main(int argc, const char *argv[]) {
    536 #if 0
    537 #endif
    538     testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
    539     testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
    540     testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
    541     testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
    542     testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
    543     testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
    544     testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
    545     testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
    546     testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
    547     testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
    548     testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
    549     testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
    550     testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
    551     testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
    552     testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
    553     testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
    554     testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
    555     testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
    556     testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
    557 #if 0
    558 #endif
    559     return 0;
    560 }
    561 
    562 #endif
    563 
    564 #endif /* #if !UCONFIG_NO_COLLATION */
    565