1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 1999-2010, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: ucol_wgt.c 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2001mar08 14 * created by: Markus W. Scherer 15 * 16 * This file contains code for allocating n collation element weights 17 * between two exclusive limits. 18 * It is used only internally by ucol_bld. 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_COLLATION 24 25 #include "ucol_imp.h" 26 #include "ucol_wgt.h" 27 #include "cmemory.h" 28 #include "uarrsort.h" 29 30 #ifdef UCOL_DEBUG 31 # include <stdio.h> 32 #endif 33 34 /* collation element weight allocation -------------------------------------- */ 35 36 /* helper functions for CE weights */ 37 38 static U_INLINE int32_t 39 lengthOfWeight(uint32_t weight) { 40 if((weight&0xffffff)==0) { 41 return 1; 42 } else if((weight&0xffff)==0) { 43 return 2; 44 } else if((weight&0xff)==0) { 45 return 3; 46 } else { 47 return 4; 48 } 49 } 50 51 static U_INLINE uint32_t 52 getWeightTrail(uint32_t weight, int32_t length) { 53 return (uint32_t)(weight>>(8*(4-length)))&0xff; 54 } 55 56 static U_INLINE uint32_t 57 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) { 58 length=8*(4-length); 59 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length)); 60 } 61 62 static U_INLINE uint32_t 63 getWeightByte(uint32_t weight, int32_t idx) { 64 return getWeightTrail(weight, idx); /* same calculation */ 65 } 66 67 static U_INLINE uint32_t 68 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) { 69 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */ 70 71 idx*=8; 72 mask=((uint32_t)0xffffffff)>>idx; 73 idx=32-idx; 74 mask|=0xffffff00<<idx; 75 return (uint32_t)((weight&mask)|(byte<<idx)); 76 } 77 78 static U_INLINE uint32_t 79 truncateWeight(uint32_t weight, int32_t length) { 80 return (uint32_t)(weight&(0xffffffff<<(8*(4-length)))); 81 } 82 83 static U_INLINE uint32_t 84 incWeightTrail(uint32_t weight, int32_t length) { 85 return (uint32_t)(weight+(1UL<<(8*(4-length)))); 86 } 87 88 static U_INLINE uint32_t 89 decWeightTrail(uint32_t weight, int32_t length) { 90 return (uint32_t)(weight-(1UL<<(8*(4-length)))); 91 } 92 93 static U_INLINE uint32_t 94 incWeight(uint32_t weight, int32_t length, uint32_t maxByte) { 95 uint32_t byte; 96 97 for(;;) { 98 byte=getWeightByte(weight, length); 99 if(byte<maxByte) { 100 return setWeightByte(weight, length, byte+1); 101 } else { 102 /* roll over, set this byte to UCOL_BYTE_FIRST_TAILORED and increment the previous one */ 103 weight=setWeightByte(weight, length, UCOL_BYTE_FIRST_TAILORED); 104 --length; 105 } 106 } 107 } 108 109 static U_INLINE int32_t 110 lengthenRange(WeightRange *range, uint32_t maxByte, uint32_t countBytes) { 111 int32_t length; 112 113 length=range->length2+1; 114 range->start=setWeightTrail(range->start, length, UCOL_BYTE_FIRST_TAILORED); 115 range->end=setWeightTrail(range->end, length, maxByte); 116 range->count2*=countBytes; 117 range->length2=length; 118 return length; 119 } 120 121 /* for uprv_sortArray: sort ranges in weight order */ 122 static int32_t U_CALLCONV 123 compareRanges(const void * /*context*/, const void *left, const void *right) { 124 uint32_t l, r; 125 126 l=((const WeightRange *)left)->start; 127 r=((const WeightRange *)right)->start; 128 if(l<r) { 129 return -1; 130 } else if(l>r) { 131 return 1; 132 } else { 133 return 0; 134 } 135 } 136 137 /* 138 * take two CE weights and calculate the 139 * possible ranges of weights between the two limits, excluding them 140 * for weights with up to 4 bytes there are up to 2*4-1=7 ranges 141 */ 142 static U_INLINE int32_t 143 getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit, 144 uint32_t maxByte, uint32_t countBytes, 145 WeightRange ranges[7]) { 146 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */ 147 uint32_t weight, trail; 148 int32_t length, lowerLength, upperLength, rangeCount; 149 150 /* assume that both lowerLimit & upperLimit are not 0 */ 151 152 /* get the lengths of the limits */ 153 lowerLength=lengthOfWeight(lowerLimit); 154 upperLength=lengthOfWeight(upperLimit); 155 156 #ifdef UCOL_DEBUG 157 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength); 158 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength); 159 #endif 160 161 if(lowerLimit>=upperLimit) { 162 #ifdef UCOL_DEBUG 163 printf("error: no space between lower & upper limits\n"); 164 #endif 165 return 0; 166 } 167 168 /* check that neither is a prefix of the other */ 169 if(lowerLength<upperLength) { 170 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) { 171 #ifdef UCOL_DEBUG 172 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit); 173 #endif 174 return 0; 175 } 176 } 177 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */ 178 179 /* reset local variables */ 180 uprv_memset(lower, 0, sizeof(lower)); 181 uprv_memset(&middle, 0, sizeof(middle)); 182 uprv_memset(upper, 0, sizeof(upper)); 183 184 /* 185 * With the limit lengths of 1..4, there are up to 7 ranges for allocation: 186 * range minimum length 187 * lower[4] 4 188 * lower[3] 3 189 * lower[2] 2 190 * middle 1 191 * upper[2] 2 192 * upper[3] 3 193 * upper[4] 4 194 * 195 * We are now going to calculate up to 7 ranges. 196 * Some of them will typically overlap, so we will then have to merge and eliminate ranges. 197 */ 198 weight=lowerLimit; 199 for(length=lowerLength; length>=2; --length) { 200 trail=getWeightTrail(weight, length); 201 if(trail<maxByte) { 202 lower[length].start=incWeightTrail(weight, length); 203 lower[length].end=setWeightTrail(weight, length, maxByte); 204 lower[length].length=length; 205 lower[length].count=maxByte-trail; 206 } 207 weight=truncateWeight(weight, length-1); 208 } 209 middle.start=incWeightTrail(weight, 1); 210 211 weight=upperLimit; 212 for(length=upperLength; length>=2; --length) { 213 trail=getWeightTrail(weight, length); 214 if(trail>UCOL_BYTE_FIRST_TAILORED) { 215 upper[length].start=setWeightTrail(weight, length, UCOL_BYTE_FIRST_TAILORED); 216 upper[length].end=decWeightTrail(weight, length); 217 upper[length].length=length; 218 upper[length].count=trail-UCOL_BYTE_FIRST_TAILORED; 219 } 220 weight=truncateWeight(weight, length-1); 221 } 222 middle.end=decWeightTrail(weight, 1); 223 224 /* set the middle range */ 225 middle.length=1; 226 if(middle.end>=middle.start) { 227 middle.count=(int32_t)((middle.end-middle.start)>>24)+1; 228 } else { 229 /* eliminate overlaps */ 230 uint32_t start, end; 231 232 /* remove the middle range */ 233 middle.count=0; 234 235 /* reduce or remove the lower ranges that go beyond upperLimit */ 236 for(length=4; length>=2; --length) { 237 if(lower[length].count>0 && upper[length].count>0) { 238 start=upper[length].start; 239 end=lower[length].end; 240 241 if(end>=start || incWeight(end, length, maxByte)==start) { 242 /* lower and upper ranges collide or are directly adjacent: merge these two and remove all shorter ranges */ 243 start=lower[length].start; 244 end=lower[length].end=upper[length].end; 245 /* 246 * merging directly adjacent ranges needs to subtract the 0/1 gaps in between; 247 * it may result in a range with count>countBytes 248 */ 249 lower[length].count= 250 (int32_t)(getWeightTrail(end, length)-getWeightTrail(start, length)+1+ 251 countBytes*(getWeightByte(end, length-1)-getWeightByte(start, length-1))); 252 upper[length].count=0; 253 while(--length>=2) { 254 lower[length].count=upper[length].count=0; 255 } 256 break; 257 } 258 } 259 } 260 } 261 262 #ifdef UCOL_DEBUG 263 /* print ranges */ 264 for(length=4; length>=2; --length) { 265 if(lower[length].count>0) { 266 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count); 267 } 268 } 269 if(middle.count>0) { 270 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count); 271 } 272 for(length=2; length<=4; ++length) { 273 if(upper[length].count>0) { 274 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count); 275 } 276 } 277 #endif 278 279 /* copy the ranges, shortest first, into the result array */ 280 rangeCount=0; 281 if(middle.count>0) { 282 uprv_memcpy(ranges, &middle, sizeof(WeightRange)); 283 rangeCount=1; 284 } 285 for(length=2; length<=4; ++length) { 286 /* copy upper first so that later the middle range is more likely the first one to use */ 287 if(upper[length].count>0) { 288 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange)); 289 ++rangeCount; 290 } 291 if(lower[length].count>0) { 292 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange)); 293 ++rangeCount; 294 } 295 } 296 return rangeCount; 297 } 298 299 /* 300 * call getWeightRanges and then determine heuristically 301 * which ranges to use for a given number of weights between (excluding) 302 * two limits 303 */ 304 U_CFUNC int32_t 305 ucol_allocWeights(uint32_t lowerLimit, uint32_t upperLimit, 306 uint32_t n, 307 uint32_t maxByte, 308 WeightRange ranges[7]) { 309 /* number of usable byte values 3..maxByte */ 310 uint32_t countBytes=maxByte-UCOL_BYTE_FIRST_TAILORED+1; 311 312 uint32_t lengthCounts[6]; /* [0] unused, [5] to make index checks unnecessary */ 313 uint32_t maxCount; 314 int32_t i, rangeCount, minLength/*, maxLength*/; 315 316 /* countBytes to the power of index */ 317 uint32_t powers[5]; 318 /* gcc requires explicit initialization */ 319 powers[0] = 1; 320 powers[1] = countBytes; 321 powers[2] = countBytes*countBytes; 322 powers[3] = countBytes*countBytes*countBytes; 323 powers[4] = countBytes*countBytes*countBytes*countBytes; 324 325 #ifdef UCOL_DEBUG 326 puts(""); 327 #endif 328 329 rangeCount=getWeightRanges(lowerLimit, upperLimit, maxByte, countBytes, ranges); 330 if(rangeCount<=0) { 331 #ifdef UCOL_DEBUG 332 printf("error: unable to get Weight ranges\n"); 333 #endif 334 return 0; 335 } 336 337 /* what is the maximum number of weights with these ranges? */ 338 maxCount=0; 339 for(i=0; i<rangeCount; ++i) { 340 maxCount+=(uint32_t)ranges[i].count*powers[4-ranges[i].length]; 341 } 342 if(maxCount>=n) { 343 #ifdef UCOL_DEBUG 344 printf("the maximum number of %lu weights is sufficient for n=%lu\n", maxCount, n); 345 #endif 346 } else { 347 #ifdef UCOL_DEBUG 348 printf("error: the maximum number of %lu weights is insufficient for n=%lu\n", maxCount, n); 349 #endif 350 return 0; 351 } 352 353 /* set the length2 and count2 fields */ 354 for(i=0; i<rangeCount; ++i) { 355 ranges[i].length2=ranges[i].length; 356 ranges[i].count2=(uint32_t)ranges[i].count; 357 } 358 359 /* try until we find suitably large ranges */ 360 for(;;) { 361 /* get the smallest number of bytes in a range */ 362 minLength=ranges[0].length2; 363 364 /* sum up the number of elements that fit into ranges of each byte length */ 365 uprv_memset(lengthCounts, 0, sizeof(lengthCounts)); 366 for(i=0; i<rangeCount; ++i) { 367 lengthCounts[ranges[i].length2]+=ranges[i].count2; 368 } 369 370 /* now try to allocate n elements in the available short ranges */ 371 if(n<=(lengthCounts[minLength]+lengthCounts[minLength+1])) { 372 /* trivial cases, use the first few ranges */ 373 maxCount=0; 374 rangeCount=0; 375 do { 376 maxCount+=ranges[rangeCount].count2; 377 ++rangeCount; 378 } while(n>maxCount); 379 #ifdef UCOL_DEBUG 380 printf("take first %ld ranges\n", rangeCount); 381 #endif 382 break; 383 } else if(n<=ranges[0].count2*countBytes) { 384 /* easy case, just make this one range large enough by lengthening it once more, possibly split it */ 385 uint32_t count1, count2, power_1, power; 386 387 /*maxLength=minLength+1;*/ 388 389 /* calculate how to split the range between maxLength-1 (count1) and maxLength (count2) */ 390 power_1=powers[minLength-ranges[0].length]; 391 power=power_1*countBytes; 392 count2=(n+power-1)/power; 393 count1=ranges[0].count-count2; 394 395 /* split the range */ 396 #ifdef UCOL_DEBUG 397 printf("split the first range %ld:%ld\n", count1, count2); 398 #endif 399 if(count1<1) { 400 rangeCount=1; 401 402 /* lengthen the entire range to maxLength */ 403 lengthenRange(ranges, maxByte, countBytes); 404 } else { 405 /* really split the range */ 406 uint32_t byte; 407 408 /* create a new range with the end and initial and current length of the old one */ 409 rangeCount=2; 410 ranges[1].end=ranges[0].end; 411 ranges[1].length=ranges[0].length; 412 ranges[1].length2=minLength; 413 414 /* set the end of the first range according to count1 */ 415 i=ranges[0].length; 416 byte=getWeightByte(ranges[0].start, i)+count1-1; 417 418 /* 419 * ranges[0].count and count1 may be >countBytes 420 * from merging adjacent ranges; 421 * byte>maxByte is possible 422 */ 423 if(byte<=maxByte) { 424 ranges[0].end=setWeightByte(ranges[0].start, i, byte); 425 } else /* byte>maxByte */ { 426 ranges[0].end=setWeightByte(incWeight(ranges[0].start, i-1, maxByte), i, byte-countBytes); 427 } 428 429 /* set the bytes in the end weight at length+1..length2 to maxByte */ 430 byte=(maxByte<<24)|(maxByte<<16)|(maxByte<<8)|maxByte; /* this used to be 0xffffffff */ 431 ranges[0].end=truncateWeight(ranges[0].end, i)| 432 ((byte>>(8*i))&(byte<<(8*(4-minLength)))); 433 434 /* set the start of the second range to immediately follow the end of the first one */ 435 ranges[1].start=incWeight(ranges[0].end, minLength, maxByte); 436 437 /* set the count values (informational) */ 438 ranges[0].count=count1; 439 ranges[1].count=count2; 440 441 ranges[0].count2=count1*power_1; 442 ranges[1].count2=count2*power_1; /* will be *countBytes when lengthened */ 443 444 /* lengthen the second range to maxLength */ 445 lengthenRange(ranges+1, maxByte, countBytes); 446 } 447 break; 448 } 449 450 /* no good match, lengthen all minLength ranges and iterate */ 451 #ifdef UCOL_DEBUG 452 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1); 453 #endif 454 for(i=0; ranges[i].length2==minLength; ++i) { 455 lengthenRange(ranges+i, maxByte, countBytes); 456 } 457 } 458 459 if(rangeCount>1) { 460 /* sort the ranges by weight values */ 461 UErrorCode errorCode=U_ZERO_ERROR; 462 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), compareRanges, NULL, FALSE, &errorCode); 463 /* ignore error code: we know that the internal sort function will not fail here */ 464 } 465 466 #ifdef UCOL_DEBUG 467 puts("final ranges:"); 468 for(i=0; i<rangeCount; ++i) { 469 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .length2=%ld .count=%ld .count2=%lu\n", 470 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].length2, ranges[i].count, ranges[i].count2); 471 } 472 #endif 473 474 /* set maxByte in ranges[0] for ucol_nextWeight() */ 475 ranges[0].count=maxByte; 476 477 return rangeCount; 478 } 479 480 /* 481 * given a set of ranges calculated by ucol_allocWeights(), 482 * iterate through the weights 483 */ 484 U_CFUNC uint32_t 485 ucol_nextWeight(WeightRange ranges[], int32_t *pRangeCount) { 486 if(*pRangeCount<=0) { 487 return 0xffffffff; 488 } else { 489 uint32_t weight, maxByte; 490 491 /* get maxByte from the .count field */ 492 maxByte=ranges[0].count; 493 494 /* get the next weight */ 495 weight=ranges[0].start; 496 if(weight==ranges[0].end) { 497 /* this range is finished, remove it and move the following ones up */ 498 if(--*pRangeCount>0) { 499 uprv_memmove(ranges, ranges+1, *pRangeCount*sizeof(WeightRange)); 500 ranges[0].count=maxByte; /* keep maxByte in ranges[0] */ 501 } 502 } else { 503 /* increment the weight for the next value */ 504 ranges[0].start=incWeight(weight, ranges[0].length2, maxByte); 505 } 506 507 return weight; 508 } 509 } 510 511 #if 0 // #ifdef UCOL_DEBUG 512 513 static void 514 testAlloc(uint32_t lowerLimit, uint32_t upperLimit, uint32_t n, UBool enumerate) { 515 WeightRange ranges[8]; 516 int32_t rangeCount; 517 518 rangeCount=ucol_allocWeights(lowerLimit, upperLimit, n, ranges); 519 if(enumerate) { 520 uint32_t weight; 521 522 while(n>0) { 523 weight=ucol_nextWeight(ranges, &rangeCount); 524 if(weight==0xffffffff) { 525 printf("error: 0xffffffff with %lu more weights to go\n", n); 526 break; 527 } 528 printf(" 0x%08lx\n", weight); 529 --n; 530 } 531 } 532 } 533 534 extern int 535 main(int argc, const char *argv[]) { 536 #if 0 537 #endif 538 testAlloc(0x364214fc, 0x44b87d23, 5, FALSE); 539 testAlloc(0x36421500, 0x44b87d23, 5, FALSE); 540 testAlloc(0x36421500, 0x44b87d23, 20, FALSE); 541 testAlloc(0x36421500, 0x44b87d23, 13700, FALSE); 542 testAlloc(0x36421500, 0x38b87d23, 1, FALSE); 543 testAlloc(0x36421500, 0x38b87d23, 20, FALSE); 544 testAlloc(0x36421500, 0x38b87d23, 200, TRUE); 545 testAlloc(0x36421500, 0x38b87d23, 13700, FALSE); 546 testAlloc(0x36421500, 0x37b87d23, 13700, FALSE); 547 testAlloc(0x36ef1500, 0x37b87d23, 13700, FALSE); 548 testAlloc(0x36421500, 0x36b87d23, 13700, FALSE); 549 testAlloc(0x36b87122, 0x36b87d23, 13700, FALSE); 550 testAlloc(0x49000000, 0x4a600000, 13700, FALSE); 551 testAlloc(0x9fffffff, 0xd0000000, 13700, FALSE); 552 testAlloc(0x9fffffff, 0xd0000000, 67400, FALSE); 553 testAlloc(0x9fffffff, 0xa0030000, 67400, FALSE); 554 testAlloc(0x9fffffff, 0xa0030000, 40000, FALSE); 555 testAlloc(0xa0000000, 0xa0030000, 40000, FALSE); 556 testAlloc(0xa0031100, 0xa0030000, 40000, FALSE); 557 #if 0 558 #endif 559 return 0; 560 } 561 562 #endif 563 564 #endif /* #if !UCONFIG_NO_COLLATION */ 565