Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 *
      4 *   Copyright (C) 1999-2011, International Business Machines
      5 *   Corporation and others.  All Rights Reserved.
      6 *
      7 *******************************************************************************
      8 *   file name:  ucol_wgt.cpp
      9 *   encoding:   US-ASCII
     10 *   tab size:   8 (not used)
     11 *   indentation:4
     12 *
     13 *   created on: 2001mar08
     14 *   created by: Markus W. Scherer
     15 *
     16 *   This file contains code for allocating n collation element weights
     17 *   between two exclusive limits.
     18 *   It is used only internally by ucol_bld.
     19 */
     20 
     21 #include "unicode/utypes.h"
     22 
     23 #if !UCONFIG_NO_COLLATION
     24 
     25 #include "ucol_imp.h"
     26 #include "ucol_wgt.h"
     27 #include "cmemory.h"
     28 #include "uarrsort.h"
     29 
     30 #ifdef UCOL_DEBUG
     31 #   include <stdio.h>
     32 #endif
     33 
     34 /* collation element weight allocation -------------------------------------- */
     35 
     36 /* helper functions for CE weights */
     37 
     38 static inline int32_t
     39 lengthOfWeight(uint32_t weight) {
     40     if((weight&0xffffff)==0) {
     41         return 1;
     42     } else if((weight&0xffff)==0) {
     43         return 2;
     44     } else if((weight&0xff)==0) {
     45         return 3;
     46     } else {
     47         return 4;
     48     }
     49 }
     50 
     51 static inline uint32_t
     52 getWeightTrail(uint32_t weight, int32_t length) {
     53     return (uint32_t)(weight>>(8*(4-length)))&0xff;
     54 }
     55 
     56 static inline uint32_t
     57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) {
     58     length=8*(4-length);
     59     return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length));
     60 }
     61 
     62 static inline uint32_t
     63 getWeightByte(uint32_t weight, int32_t idx) {
     64     return getWeightTrail(weight, idx); /* same calculation */
     65 }
     66 
     67 static inline uint32_t
     68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) {
     69     uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */
     70 
     71     idx*=8;
     72     if(idx<32) {
     73         mask=((uint32_t)0xffffffff)>>idx;
     74     } else {
     75         // Do not use uint32_t>>32 because on some platforms that does not shift at all
     76         // while we need it to become 0.
     77         // PowerPC: 0xffffffff>>32 = 0           (wanted)
     78         // x86:     0xffffffff>>32 = 0xffffffff  (not wanted)
     79         //
     80         // ANSI C99 6.5.7 Bitwise shift operators:
     81         // "If the value of the right operand is negative
     82         // or is greater than or equal to the width of the promoted left operand,
     83         // the behavior is undefined."
     84         mask=0;
     85     }
     86     idx=32-idx;
     87     mask|=0xffffff00<<idx;
     88     return (uint32_t)((weight&mask)|(byte<<idx));
     89 }
     90 
     91 static inline uint32_t
     92 truncateWeight(uint32_t weight, int32_t length) {
     93     return (uint32_t)(weight&(0xffffffff<<(8*(4-length))));
     94 }
     95 
     96 static inline uint32_t
     97 incWeightTrail(uint32_t weight, int32_t length) {
     98     return (uint32_t)(weight+(1UL<<(8*(4-length))));
     99 }
    100 
    101 static inline uint32_t
    102 decWeightTrail(uint32_t weight, int32_t length) {
    103     return (uint32_t)(weight-(1UL<<(8*(4-length))));
    104 }
    105 
    106 static inline uint32_t
    107 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) {
    108     uint32_t byte;
    109 
    110     for(;;) {
    111         byte=getWeightByte(weight, length);
    112         if(byte<maxByte) {
    113             return setWeightByte(weight, length, byte+1);
    114         } else {
    115             /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */
    116             weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED);
    117             --length;
    118         }
    119     }
    120 }
    121 
    122 static inline int32_t
    123 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) {
    124     int32_t length;
    125 
    126     length=range->length2+1;
    127     range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED);
    128     range->end=setWeightTrail(range->end, length, maxByte);
    129     range->count2*=countBytes;
    130     range->length2=length;
    131     return length;
    132 }
    133 
    134 /* for uprv_sortArray: sort ranges in weight order */
    135 static int32_t U_CALLCONV
    136 compareRanges(const void * /*context*/, const void *left, const void *right) {
    137     uint32_t l, r;
    138 
    139     l=((const WeightRange *)left)->start;
    140     r=((const WeightRange *)right)->start;
    141     if(l<r) {
    142         return -1;
    143     } else if(l>r) {
    144         return 1;
    145     } else {
    146         return 0;
    147     }
    148 }
    149 
    150 /*
    151  * take two CE weights and calculate the
    152  * possible ranges of weights between the two limits, excluding them
    153  * for weights with up to 4 bytes there are up to 2*4-1=7 ranges
    154  */
    155 static inline int32_t
    156 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit,
    157                 uint32_t maxByte, uint32_t countBytes,
    158                 WeightRange ranges[7]) {
    159     WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */
    160     uint32_t weight, trail;
    161     int32_t length, lowerLength, upperLength, rangeCount;
    162 
    163     /* assume that both lowerLimit & upperLimit are not 0 */
    164 
    165     /* get the lengths of the limits */
    166     lowerLength=lengthOfWeight(lowerLimit);
    167     upperLength=lengthOfWeight(upperLimit);
    168 
    169 #ifdef UCOL_DEBUG
    170     printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength);
    171     printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength);
    172 #endif
    173 
    174     if(lowerLimit>=upperLimit) {
    175 #ifdef UCOL_DEBUG
    176         printf("error: no space between lower & upper limits\n");
    177 #endif
    178         return 0;
    179     }
    180 
    181     /* check that neither is a prefix of the other */
    182     if(lowerLength<upperLength) {
    183         if(lowerLimit==truncateWeight(upperLimit, lowerLength)) {
    184 #ifdef UCOL_DEBUG
    185             printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit);
    186 #endif
    187             return 0;
    188         }
    189     }
    190     /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */
    191 
    192     /* reset local variables */
    193     uprv_memset(lower, 0, sizeof(lower));
    194     uprv_memset(&middle, 0, sizeof(middle));
    195     uprv_memset(upper, 0, sizeof(upper));
    196 
    197     /*
    198      * With the limit lengths of 1..4, there are up to 7 ranges for allocation:
    199      * range     minimum length
    200      * lower[4]  4
    201      * lower[3]  3
    202      * lower[2]  2
    203      * middle    1
    204      * upper[2]  2
    205      * upper[3]  3
    206      * upper[4]  4
    207      *
    208      * We are now going to calculate up to 7 ranges.
    209      * Some of them will typically overlap, so we will then have to merge and eliminate ranges.
    210      */
    211     weight=lowerLimit;
    212     for(length=lowerLength; length>=2; --length) {
    213         trail=getWeightTrail(weight, length);
    214         if(trail<maxByte) {
    215             lower[length].start=incWeightTrail(weight, length);
    216             lower[length].end=setWeightTrail(weight, length, maxByte);
    217             lower[length].length=length;
    218             lower[length].count=maxByte-trail;
    219         }
    220         weight=truncateWeight(weight, length-1);
    221     }
    222     middle.start=incWeightTrail(weight, 1);
    223 
    224     weight=upperLimit;
    225     for(length=upperLength; length>=2; --length) {
    226         trail=getWeightTrail(weight, length);
    227         if(trail>UCOL_BYTE_FIRST_TAILORED) {
    228             upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED);
    229             upper[length].end=decWeightTrail(weight, length);
    230             upper[length].length=length;
    231             upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED;
    232         }
    233         weight=truncateWeight(weight, length-1);
    234     }
    235     middle.end=decWeightTrail(weight, 1);
    236 
    237     /* set the middle range */
    238     middle.length=1;
    239     if(middle.end>=middle.start) {
    240         middle.count=(int32_t)((middle.end-middle.start)>>24)+1;
    241     } else {
    242         /* eliminate overlaps */
    243         uint32_t start, end;
    244 
    245         /* remove the middle range */
    246         middle.count=0;
    247 
    248         /* reduce or remove the lower ranges that go beyond upperLimit */
    249         for(length=4; length>=2; --length) {
    250             if(lower[length].count>0 && upper[length].count>0) {
    251                 start=upper[length].start;
    252                 end=lower[length].end;
    253 
    254                 if(end>=start || incWeight(end, length, maxByte)==start) {
    255                     /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */
    256                     start=lower[length].start;
    257                     end=lower[length].end=upper[length].end;
    258                     /*
    259                      * merging directly adjacent ranges needs to subtract the 0/1 gaps in between;
    260                      * it may result in a range with count>countBytes
    261                      */
    262                     lower[length].count=
    263                         (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+
    264                                   countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1)));
    265                     upper[length].count=0;
    266                     while(--length>=2) {
    267                         lower[length].count=upper[length].count=0;
    268                     }
    269                     break;
    270                 }
    271             }
    272         }
    273     }
    274 
    275 #ifdef UCOL_DEBUG
    276     /* print ranges */
    277     for(length=4; length>=2; --length) {
    278         if(lower[length].count>0) {
    279             printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count);
    280         }
    281     }
    282     if(middle.count>0) {
    283         printf("middle   .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count);
    284     }
    285     for(length=2; length<=4; ++length) {
    286         if(upper[length].count>0) {
    287             printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count);
    288         }
    289     }
    290 #endif
    291 
    292     /* copy the ranges, shortest first, into the result array */
    293     rangeCount=0;
    294     if(middle.count>0) {
    295         uprv_memcpy(ranges, &middle, sizeof(WeightRange));
    296         rangeCount=1;
    297     }
    298     for(length=2; length<=4; ++length) {
    299         /* copy upper first so that later the middle range is more likely the first one to use */
    300         if(upper[length].count>0) {
    301             uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange));
    302             ++rangeCount;
    303         }
    304         if(lower[length].count>0) {
    305             uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange));
    306             ++rangeCount;
    307         }
    308     }
    309     return rangeCount;
    310 }
    311 
    312 /*
    313  * call getWeightRanges and then determine heuristically
    314  * which ranges to use for a given number of weights between (excluding)
    315  * two limits
    316  */
    317 U_CFUNC int32_t
    318 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit,
    319                   uint32_t n,
    320                   uint32_t maxByte,
    321                   WeightRange ranges[7]) {
    322     /* number of usable byte values 3..maxByte */
    323     uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1;
    324 
    325     uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */
    326     uint32_t maxCount;
    327     int32_t i, rangeCount, minLength/*, maxLength*/;
    328 
    329     /* countBytes to the power of index */
    330     uint32_t powers[5];
    331     /* gcc requires explicit initialization */
    332     powers[0] = 1;
    333     powers[1] = countBytes;
    334     powers[2] = countBytes*countBytes;
    335     powers[3] = countBytes*countBytes*countBytes;
    336     powers[4] = countBytes*countBytes*countBytes*countBytes;
    337 
    338 #ifdef UCOL_DEBUG
    339     puts("");
    340 #endif
    341 
    342     rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges);
    343     if(rangeCount<=0) {
    344 #ifdef UCOL_DEBUG
    345         printf("error: unable to get Weight ranges\n");
    346 #endif
    347         return 0;
    348     }
    349 
    350     /* what is the maximum number of weights with these ranges? */
    351     maxCount=0;
    352     for(i=0; i<rangeCount; ++i) {
    353         maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length];
    354     }
    355     if(maxCount>=n) {
    356 #ifdef UCOL_DEBUG
    357         printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n);
    358 #endif
    359     } else {
    360 #ifdef UCOL_DEBUG
    361         printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n);
    362 #endif
    363         return 0;
    364     }
    365 
    366     /* set the length2 and count2 fields */
    367     for(i=0; i<rangeCount; ++i) {
    368         ranges[i].length2=ranges[i].length;
    369         ranges[i].count2=(uint32_t)ranges[i].count;
    370     }
    371 
    372     /* try until we find suitably large ranges */
    373     for(;;) {
    374         /* get the smallest number of bytes in a range */
    375         minLength=ranges[0].length2;
    376 
    377         /* sum up the number of elements that fit into ranges of each byte length */
    378         uprv_memset(lengthCounts, 0, sizeof(lengthCounts));
    379         for(i=0; i<rangeCount; ++i) {
    380             lengthCounts[ranges[i].length2]+=ranges[i].count2;
    381         }
    382 
    383         /* now try to allocate n elements in the available short ranges */
    384         if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) {
    385             /* trivial cases, use the first few ranges */
    386             maxCount=0;
    387             rangeCount=0;
    388             do {
    389                 maxCount+=ranges[rangeCount].count2;
    390                 ++rangeCount;
    391             } while(n>maxCount);
    392 #ifdef UCOL_DEBUG
    393             printf("take first %ld ranges\n", rangeCount);
    394 #endif
    395             break;
    396         } else if(n<=ranges[0].count2*countBytes) {
    397             /* easy case, just make this one range large enough by lengthening it once more, possibly split it */
    398             uint32_t count1, count2, power_1, power;
    399 
    400             /*maxLength=minLength+1;*/
    401 
    402             /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */
    403             power_1=powers[minLength-ranges[0].length];
    404             power=power_1*countBytes;
    405             count2=(n+power-1)/power;
    406             count1=ranges[0].count-count2;
    407 
    408             /* split the range */
    409 #ifdef UCOL_DEBUG
    410             printf("split the first range %ld:%ld\n", count1, count2);
    411 #endif
    412             if(count1<1) {
    413                 rangeCount=1;
    414 
    415                 /* lengthen the entire range to maxLength */
    416                 lengthenRange(ranges, maxByte, countBytes);
    417             } else {
    418                 /* really split the range */
    419                 uint32_t byte;
    420 
    421                 /* create a new range with the end and initial and current length of the old one */
    422                 rangeCount=2;
    423                 ranges[1].end=ranges[0].end;
    424                 ranges[1].length=ranges[0].length;
    425                 ranges[1].length2=minLength;
    426 
    427                 /* set the end of the first range according to count1 */
    428                 i=ranges[0].length;
    429                 byte=getWeightByte(ranges[0].start, i)+count1-1;
    430 
    431                 /*
    432                  * ranges[0].count and count1 may be >countBytes
    433                  * from merging adjacent ranges;
    434                  * byte>maxByte is possible
    435                  */
    436                 if(byte<=maxByte) {
    437                     ranges[0].end=setWeightByte(ranges[0].start, i, byte);
    438                 } else /* byte>maxByte */ {
    439                     ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes);
    440                 }
    441 
    442                 /* set the bytes in the end weight at length+1..length2 to maxByte */
    443                 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */
    444                 ranges[0].end=truncateWeight(ranges[0].end, i)|
    445                               ((byte>>(8*i))&(byte<<(8*(4-minLength))));
    446 
    447                 /* set the start of the second range to immediately follow the end of the first one */
    448                 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte);
    449 
    450                 /* set the count values (informational) */
    451                 ranges[0].count=count1;
    452                 ranges[1].count=count2;
    453 
    454                 ranges[0].count2=count1*power_1;
    455                 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */
    456 
    457                 /* lengthen the second range to maxLength */
    458                 lengthenRange(ranges+1, maxByte, countBytes);
    459             }
    460             break;
    461         }
    462 
    463         /* no good match, lengthen all minLength ranges and iterate */
    464 #ifdef UCOL_DEBUG
    465         printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1);
    466 #endif
    467         for(i=0; ranges[i].length2==minLength; ++i) {
    468             lengthenRange(ranges+i, maxByte, countBytes);
    469         }
    470     }
    471 
    472     if(rangeCount>1) {
    473         /* sort the ranges by weight values */
    474         UErrorCode errorCode=U_ZERO_ERROR;
    475         uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode);
    476         /* ignore error code: we know that the internal sort function will not fail here */
    477     }
    478 
    479 #ifdef UCOL_DEBUG
    480     puts("final ranges:");
    481     for(i=0; i<rangeCount; ++i) {
    482         printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n",
    483                i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2);
    484     }
    485 #endif
    486 
    487     /* set maxByte in ranges[0] for ucol_nextWeight() */
    488     ranges[0].count=maxByte;
    489 
    490     return rangeCount;
    491 }
    492 
    493 /*
    494  * given a set of ranges calculated by ucol_allocWeights(),
    495  * iterate through the weights
    496  */
    497 U_CFUNC uint32_t
    498 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) {
    499     if(*pRangeCount<=0) {
    500         return 0xffffffff;
    501     } else {
    502         uint32_t weight, maxByte;
    503 
    504         /* get maxByte from the .count field */
    505         maxByte=ranges[0].count;
    506 
    507         /* get the next weight */
    508         weight=ranges[0].start;
    509         if(weight==ranges[0].end) {
    510             /* this range is finished, remove it and move the following ones up */
    511             if(--*pRangeCount>0) {
    512                 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange));
    513                 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */
    514             }
    515         } else {
    516             /* increment the weight for the next value */
    517             ranges[0].start=incWeight(weight, ranges[0].length2, maxByte);
    518         }
    519 
    520         return weight;
    521     }
    522 }
    523 
    524 #if 0 // #ifdef UCOL_DEBUG
    525 
    526 static void
    527 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) {
    528     WeightRange ranges[8];
    529     int32_t rangeCount;
    530 
    531     rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges);
    532     if(enumerate) {
    533         uint32_t weight;
    534 
    535         while(n>0) {
    536             weight=ucol_nextWeight(ranges, &rangeCount);
    537             if(weight==0xffffffff) {
    538                 printf("error: 0xffffffff with %lu more weights to go\n", n);
    539                 break;
    540             }
    541             printf("    0x%08lx\n", weight);
    542             --n;
    543         }
    544     }
    545 }
    546 
    547 extern int
    548 main(int argc, const char *argv[]) {
    549 #if 0
    550 #endif
    551     testAlloc(0x364214fc, 0x44b87d23, 5, FALSE);
    552     testAlloc(0x36421500, 0x44b87d23, 5, FALSE);
    553     testAlloc(0x36421500, 0x44b87d23, 20, FALSE);
    554     testAlloc(0x36421500, 0x44b87d23, 13700, FALSE);
    555     testAlloc(0x36421500, 0x38b87d23, 1, FALSE);
    556     testAlloc(0x36421500, 0x38b87d23, 20, FALSE);
    557     testAlloc(0x36421500, 0x38b87d23, 200, TRUE);
    558     testAlloc(0x36421500, 0x38b87d23, 13700, FALSE);
    559     testAlloc(0x36421500, 0x37b87d23, 13700, FALSE);
    560     testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE);
    561     testAlloc(0x36421500, 0x36b87d23, 13700, FALSE);
    562     testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE);
    563     testAlloc(0x49000000, 0x4a600000, 13700, FALSE);
    564     testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE);
    565     testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE);
    566     testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE);
    567     testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE);
    568     testAlloc(0xa0000000, 0xa0030000, 40000, FALSE);
    569     testAlloc(0xa0031100, 0xa0030000, 40000, FALSE);
    570 #if 0
    571 #endif
    572     return 0;
    573 }
    574 
    575 #endif
    576 
    577 #endif /* #if !UCONFIG_NO_COLLATION */
    578