1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 1999-2015, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: collationweights.cpp 11 * encoding: UTF-8 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2001mar08 as ucol_wgt.cpp 16 * created by: Markus W. Scherer 17 * 18 * This file contains code for allocating n collation element weights 19 * between two exclusive limits. 20 * It is used only internally by the collation tailoring builder. 21 */ 22 23 #include "unicode/utypes.h" 24 25 #if !UCONFIG_NO_COLLATION 26 27 #include "cmemory.h" 28 #include "collation.h" 29 #include "collationweights.h" 30 #include "uarrsort.h" 31 #include "uassert.h" 32 33 #ifdef UCOL_DEBUG 34 # include <stdio.h> 35 #endif 36 37 U_NAMESPACE_BEGIN 38 39 /* collation element weight allocation -------------------------------------- */ 40 41 /* helper functions for CE weights */ 42 43 static inline uint32_t 44 getWeightTrail(uint32_t weight, int32_t length) { 45 return (uint32_t)(weight>>(8*(4-length)))&0xff; 46 } 47 48 static inline uint32_t 49 setWeightTrail(uint32_t weight, int32_t length, uint32_t trail) { 50 length=8*(4-length); 51 return (uint32_t)((weight&(0xffffff00<<length))|(trail<<length)); 52 } 53 54 static inline uint32_t 55 getWeightByte(uint32_t weight, int32_t idx) { 56 return getWeightTrail(weight, idx); /* same calculation */ 57 } 58 59 static inline uint32_t 60 setWeightByte(uint32_t weight, int32_t idx, uint32_t byte) { 61 uint32_t mask; /* 0xffffffff except a 00 "hole" for the index-th byte */ 62 63 idx*=8; 64 if(idx<32) { 65 mask=((uint32_t)0xffffffff)>>idx; 66 } else { 67 // Do not use uint32_t>>32 because on some platforms that does not shift at all 68 // while we need it to become 0. 69 // PowerPC: 0xffffffff>>32 = 0 (wanted) 70 // x86: 0xffffffff>>32 = 0xffffffff (not wanted) 71 // 72 // ANSI C99 6.5.7 Bitwise shift operators: 73 // "If the value of the right operand is negative 74 // or is greater than or equal to the width of the promoted left operand, 75 // the behavior is undefined." 76 mask=0; 77 } 78 idx=32-idx; 79 mask|=0xffffff00<<idx; 80 return (uint32_t)((weight&mask)|(byte<<idx)); 81 } 82 83 static inline uint32_t 84 truncateWeight(uint32_t weight, int32_t length) { 85 return (uint32_t)(weight&(0xffffffff<<(8*(4-length)))); 86 } 87 88 static inline uint32_t 89 incWeightTrail(uint32_t weight, int32_t length) { 90 return (uint32_t)(weight+(1UL<<(8*(4-length)))); 91 } 92 93 static inline uint32_t 94 decWeightTrail(uint32_t weight, int32_t length) { 95 return (uint32_t)(weight-(1UL<<(8*(4-length)))); 96 } 97 98 CollationWeights::CollationWeights() 99 : middleLength(0), rangeIndex(0), rangeCount(0) { 100 for(int32_t i = 0; i < 5; ++i) { 101 minBytes[i] = maxBytes[i] = 0; 102 } 103 } 104 105 void 106 CollationWeights::initForPrimary(UBool compressible) { 107 middleLength=1; 108 minBytes[1] = Collation::MERGE_SEPARATOR_BYTE + 1; 109 maxBytes[1] = Collation::TRAIL_WEIGHT_BYTE; 110 if(compressible) { 111 minBytes[2] = Collation::PRIMARY_COMPRESSION_LOW_BYTE + 1; 112 maxBytes[2] = Collation::PRIMARY_COMPRESSION_HIGH_BYTE - 1; 113 } else { 114 minBytes[2] = 2; 115 maxBytes[2] = 0xff; 116 } 117 minBytes[3] = 2; 118 maxBytes[3] = 0xff; 119 minBytes[4] = 2; 120 maxBytes[4] = 0xff; 121 } 122 123 void 124 CollationWeights::initForSecondary() { 125 // We use only the lower 16 bits for secondary weights. 126 middleLength=3; 127 minBytes[1] = 0; 128 maxBytes[1] = 0; 129 minBytes[2] = 0; 130 maxBytes[2] = 0; 131 minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; 132 maxBytes[3] = 0xff; 133 minBytes[4] = 2; 134 maxBytes[4] = 0xff; 135 } 136 137 void 138 CollationWeights::initForTertiary() { 139 // We use only the lower 16 bits for tertiary weights. 140 middleLength=3; 141 minBytes[1] = 0; 142 maxBytes[1] = 0; 143 minBytes[2] = 0; 144 maxBytes[2] = 0; 145 // We use only 6 bits per byte. 146 // The other bits are used for case & quaternary weights. 147 minBytes[3] = Collation::LEVEL_SEPARATOR_BYTE + 1; 148 maxBytes[3] = 0x3f; 149 minBytes[4] = 2; 150 maxBytes[4] = 0x3f; 151 } 152 153 uint32_t 154 CollationWeights::incWeight(uint32_t weight, int32_t length) const { 155 for(;;) { 156 uint32_t byte=getWeightByte(weight, length); 157 if(byte<maxBytes[length]) { 158 return setWeightByte(weight, length, byte+1); 159 } else { 160 // Roll over, set this byte to the minimum and increment the previous one. 161 weight=setWeightByte(weight, length, minBytes[length]); 162 --length; 163 U_ASSERT(length > 0); 164 } 165 } 166 } 167 168 uint32_t 169 CollationWeights::incWeightByOffset(uint32_t weight, int32_t length, int32_t offset) const { 170 for(;;) { 171 offset += getWeightByte(weight, length); 172 if((uint32_t)offset <= maxBytes[length]) { 173 return setWeightByte(weight, length, offset); 174 } else { 175 // Split the offset between this byte and the previous one. 176 offset -= minBytes[length]; 177 weight = setWeightByte(weight, length, minBytes[length] + offset % countBytes(length)); 178 offset /= countBytes(length); 179 --length; 180 U_ASSERT(length > 0); 181 } 182 } 183 } 184 185 void 186 CollationWeights::lengthenRange(WeightRange &range) const { 187 int32_t length=range.length+1; 188 range.start=setWeightTrail(range.start, length, minBytes[length]); 189 range.end=setWeightTrail(range.end, length, maxBytes[length]); 190 range.count*=countBytes(length); 191 range.length=length; 192 } 193 194 /* for uprv_sortArray: sort ranges in weight order */ 195 static int32_t U_CALLCONV 196 compareRanges(const void * /*context*/, const void *left, const void *right) { 197 uint32_t l, r; 198 199 l=((const CollationWeights::WeightRange *)left)->start; 200 r=((const CollationWeights::WeightRange *)right)->start; 201 if(l<r) { 202 return -1; 203 } else if(l>r) { 204 return 1; 205 } else { 206 return 0; 207 } 208 } 209 210 UBool 211 CollationWeights::getWeightRanges(uint32_t lowerLimit, uint32_t upperLimit) { 212 U_ASSERT(lowerLimit != 0); 213 U_ASSERT(upperLimit != 0); 214 215 /* get the lengths of the limits */ 216 int32_t lowerLength=lengthOfWeight(lowerLimit); 217 int32_t upperLength=lengthOfWeight(upperLimit); 218 219 #ifdef UCOL_DEBUG 220 printf("length of lower limit 0x%08lx is %ld\n", lowerLimit, lowerLength); 221 printf("length of upper limit 0x%08lx is %ld\n", upperLimit, upperLength); 222 #endif 223 U_ASSERT(lowerLength>=middleLength); 224 // Permit upperLength<middleLength: The upper limit for secondaries is 0x10000. 225 226 if(lowerLimit>=upperLimit) { 227 #ifdef UCOL_DEBUG 228 printf("error: no space between lower & upper limits\n"); 229 #endif 230 return FALSE; 231 } 232 233 /* check that neither is a prefix of the other */ 234 if(lowerLength<upperLength) { 235 if(lowerLimit==truncateWeight(upperLimit, lowerLength)) { 236 #ifdef UCOL_DEBUG 237 printf("error: lower limit 0x%08lx is a prefix of upper limit 0x%08lx\n", lowerLimit, upperLimit); 238 #endif 239 return FALSE; 240 } 241 } 242 /* if the upper limit is a prefix of the lower limit then the earlier test lowerLimit>=upperLimit has caught it */ 243 244 WeightRange lower[5], middle, upper[5]; /* [0] and [1] are not used - this simplifies indexing */ 245 uprv_memset(lower, 0, sizeof(lower)); 246 uprv_memset(&middle, 0, sizeof(middle)); 247 uprv_memset(upper, 0, sizeof(upper)); 248 249 /* 250 * With the limit lengths of 1..4, there are up to 7 ranges for allocation: 251 * range minimum length 252 * lower[4] 4 253 * lower[3] 3 254 * lower[2] 2 255 * middle 1 256 * upper[2] 2 257 * upper[3] 3 258 * upper[4] 4 259 * 260 * We are now going to calculate up to 7 ranges. 261 * Some of them will typically overlap, so we will then have to merge and eliminate ranges. 262 */ 263 uint32_t weight=lowerLimit; 264 for(int32_t length=lowerLength; length>middleLength; --length) { 265 uint32_t trail=getWeightTrail(weight, length); 266 if(trail<maxBytes[length]) { 267 lower[length].start=incWeightTrail(weight, length); 268 lower[length].end=setWeightTrail(weight, length, maxBytes[length]); 269 lower[length].length=length; 270 lower[length].count=maxBytes[length]-trail; 271 } 272 weight=truncateWeight(weight, length-1); 273 } 274 if(weight<0xff000000) { 275 middle.start=incWeightTrail(weight, middleLength); 276 } else { 277 // Prevent overflow for primary lead byte FF 278 // which would yield a middle range starting at 0. 279 middle.start=0xffffffff; // no middle range 280 } 281 282 weight=upperLimit; 283 for(int32_t length=upperLength; length>middleLength; --length) { 284 uint32_t trail=getWeightTrail(weight, length); 285 if(trail>minBytes[length]) { 286 upper[length].start=setWeightTrail(weight, length, minBytes[length]); 287 upper[length].end=decWeightTrail(weight, length); 288 upper[length].length=length; 289 upper[length].count=trail-minBytes[length]; 290 } 291 weight=truncateWeight(weight, length-1); 292 } 293 middle.end=decWeightTrail(weight, middleLength); 294 295 /* set the middle range */ 296 middle.length=middleLength; 297 if(middle.end>=middle.start) { 298 middle.count=(int32_t)((middle.end-middle.start)>>(8*(4-middleLength)))+1; 299 } else { 300 /* no middle range, eliminate overlaps */ 301 for(int32_t length=4; length>middleLength; --length) { 302 if(lower[length].count>0 && upper[length].count>0) { 303 // Note: The lowerEnd and upperStart weights are versions of 304 // lowerLimit and upperLimit (which are lowerLimit<upperLimit), 305 // truncated (still less-or-equal) 306 // and then with their last bytes changed to the 307 // maxByte (for lowerEnd) or minByte (for upperStart). 308 const uint32_t lowerEnd=lower[length].end; 309 const uint32_t upperStart=upper[length].start; 310 UBool merged=FALSE; 311 312 if(lowerEnd>upperStart) { 313 // These two lower and upper ranges collide. 314 // Since lowerLimit<upperLimit and lowerEnd and upperStart 315 // are versions with only their last bytes modified 316 // (and following ones removed/reset to 0), 317 // lowerEnd>upperStart is only possible 318 // if the leading bytes are equal 319 // and lastByte(lowerEnd)>lastByte(upperStart). 320 U_ASSERT(truncateWeight(lowerEnd, length-1)== 321 truncateWeight(upperStart, length-1)); 322 // Intersect these two ranges. 323 lower[length].end=upper[length].end; 324 lower[length].count= 325 (int32_t)getWeightTrail(lower[length].end, length)- 326 (int32_t)getWeightTrail(lower[length].start, length)+1; 327 // count might be <=0 in which case there is no room, 328 // and the range-collecting code below will ignore this range. 329 merged=TRUE; 330 } else if(lowerEnd==upperStart) { 331 // Not possible, unless minByte==maxByte which is not allowed. 332 U_ASSERT(minBytes[length]<maxBytes[length]); 333 } else /* lowerEnd<upperStart */ { 334 if(incWeight(lowerEnd, length)==upperStart) { 335 // Merge adjacent ranges. 336 lower[length].end=upper[length].end; 337 lower[length].count+=upper[length].count; // might be >countBytes 338 merged=TRUE; 339 } 340 } 341 if(merged) { 342 // Remove all shorter ranges. 343 // There was no room available for them between the ranges we just merged. 344 upper[length].count=0; 345 while(--length>middleLength) { 346 lower[length].count=upper[length].count=0; 347 } 348 break; 349 } 350 } 351 } 352 } 353 354 #ifdef UCOL_DEBUG 355 /* print ranges */ 356 for(int32_t length=4; length>=2; --length) { 357 if(lower[length].count>0) { 358 printf("lower[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, lower[length].start, lower[length].end, lower[length].count); 359 } 360 } 361 if(middle.count>0) { 362 printf("middle .start=0x%08lx .end=0x%08lx .count=%ld\n", middle.start, middle.end, middle.count); 363 } 364 for(int32_t length=2; length<=4; ++length) { 365 if(upper[length].count>0) { 366 printf("upper[%ld] .start=0x%08lx .end=0x%08lx .count=%ld\n", length, upper[length].start, upper[length].end, upper[length].count); 367 } 368 } 369 #endif 370 371 /* copy the ranges, shortest first, into the result array */ 372 rangeCount=0; 373 if(middle.count>0) { 374 uprv_memcpy(ranges, &middle, sizeof(WeightRange)); 375 rangeCount=1; 376 } 377 for(int32_t length=middleLength+1; length<=4; ++length) { 378 /* copy upper first so that later the middle range is more likely the first one to use */ 379 if(upper[length].count>0) { 380 uprv_memcpy(ranges+rangeCount, upper+length, sizeof(WeightRange)); 381 ++rangeCount; 382 } 383 if(lower[length].count>0) { 384 uprv_memcpy(ranges+rangeCount, lower+length, sizeof(WeightRange)); 385 ++rangeCount; 386 } 387 } 388 return rangeCount>0; 389 } 390 391 UBool 392 CollationWeights::allocWeightsInShortRanges(int32_t n, int32_t minLength) { 393 // See if the first few minLength and minLength+1 ranges have enough weights. 394 for(int32_t i = 0; i < rangeCount && ranges[i].length <= (minLength + 1); ++i) { 395 if(n <= ranges[i].count) { 396 // Use the first few minLength and minLength+1 ranges. 397 if(ranges[i].length > minLength) { 398 // Reduce the number of weights from the last minLength+1 range 399 // which might sort before some minLength ranges, 400 // so that we use all weights in the minLength ranges. 401 ranges[i].count = n; 402 } 403 rangeCount = i + 1; 404 #ifdef UCOL_DEBUG 405 printf("take first %ld ranges\n", rangeCount); 406 #endif 407 408 if(rangeCount>1) { 409 /* sort the ranges by weight values */ 410 UErrorCode errorCode=U_ZERO_ERROR; 411 uprv_sortArray(ranges, rangeCount, sizeof(WeightRange), 412 compareRanges, NULL, FALSE, &errorCode); 413 /* ignore error code: we know that the internal sort function will not fail here */ 414 } 415 return TRUE; 416 } 417 n -= ranges[i].count; // still >0 418 } 419 return FALSE; 420 } 421 422 UBool 423 CollationWeights::allocWeightsInMinLengthRanges(int32_t n, int32_t minLength) { 424 // See if the minLength ranges have enough weights 425 // when we split one and lengthen the following ones. 426 int32_t count = 0; 427 int32_t minLengthRangeCount; 428 for(minLengthRangeCount = 0; 429 minLengthRangeCount < rangeCount && 430 ranges[minLengthRangeCount].length == minLength; 431 ++minLengthRangeCount) { 432 count += ranges[minLengthRangeCount].count; 433 } 434 435 int32_t nextCountBytes = countBytes(minLength + 1); 436 if(n > count * nextCountBytes) { return FALSE; } 437 438 // Use the minLength ranges. Merge them, and then split again as necessary. 439 uint32_t start = ranges[0].start; 440 uint32_t end = ranges[0].end; 441 for(int32_t i = 1; i < minLengthRangeCount; ++i) { 442 if(ranges[i].start < start) { start = ranges[i].start; } 443 if(ranges[i].end > end) { end = ranges[i].end; } 444 } 445 446 // Calculate how to split the range between minLength (count1) and minLength+1 (count2). 447 // Goal: 448 // count1 + count2 * nextCountBytes = n 449 // count1 + count2 = count 450 // These turn into 451 // (count - count2) + count2 * nextCountBytes = n 452 // and then into the following count1 & count2 computations. 453 int32_t count2 = (n - count) / (nextCountBytes - 1); // number of weights to be lengthened 454 int32_t count1 = count - count2; // number of minLength weights 455 if(count2 == 0 || (count1 + count2 * nextCountBytes) < n) { 456 // round up 457 ++count2; 458 --count1; 459 U_ASSERT((count1 + count2 * nextCountBytes) >= n); 460 } 461 462 ranges[0].start = start; 463 464 if(count1 == 0) { 465 // Make one long range. 466 ranges[0].end = end; 467 ranges[0].count = count; 468 lengthenRange(ranges[0]); 469 rangeCount = 1; 470 } else { 471 // Split the range, lengthen the second part. 472 #ifdef UCOL_DEBUG 473 printf("split the range number %ld (out of %ld minLength ranges) by %ld:%ld\n", 474 splitRange, rangeCount, count1, count2); 475 #endif 476 477 // Next start = start + count1. First end = 1 before that. 478 ranges[0].end = incWeightByOffset(start, minLength, count1 - 1); 479 ranges[0].count = count1; 480 481 ranges[1].start = incWeight(ranges[0].end, minLength); 482 ranges[1].end = end; 483 ranges[1].length = minLength; // +1 when lengthened 484 ranges[1].count = count2; // *countBytes when lengthened 485 lengthenRange(ranges[1]); 486 rangeCount = 2; 487 } 488 return TRUE; 489 } 490 491 /* 492 * call getWeightRanges and then determine heuristically 493 * which ranges to use for a given number of weights between (excluding) 494 * two limits 495 */ 496 UBool 497 CollationWeights::allocWeights(uint32_t lowerLimit, uint32_t upperLimit, int32_t n) { 498 #ifdef UCOL_DEBUG 499 puts(""); 500 #endif 501 502 if(!getWeightRanges(lowerLimit, upperLimit)) { 503 #ifdef UCOL_DEBUG 504 printf("error: unable to get Weight ranges\n"); 505 #endif 506 return FALSE; 507 } 508 509 /* try until we find suitably large ranges */ 510 for(;;) { 511 /* get the smallest number of bytes in a range */ 512 int32_t minLength=ranges[0].length; 513 514 if(allocWeightsInShortRanges(n, minLength)) { break; } 515 516 if(minLength == 4) { 517 #ifdef UCOL_DEBUG 518 printf("error: the maximum number of %ld weights is insufficient for n=%ld\n", 519 minLengthCount, n); 520 #endif 521 return FALSE; 522 } 523 524 if(allocWeightsInMinLengthRanges(n, minLength)) { break; } 525 526 /* no good match, lengthen all minLength ranges and iterate */ 527 #ifdef UCOL_DEBUG 528 printf("lengthen the short ranges from %ld bytes to %ld and iterate\n", minLength, minLength+1); 529 #endif 530 for(int32_t i=0; i<rangeCount && ranges[i].length==minLength; ++i) { 531 lengthenRange(ranges[i]); 532 } 533 } 534 535 #ifdef UCOL_DEBUG 536 puts("final ranges:"); 537 for(int32_t i=0; i<rangeCount; ++i) { 538 printf("ranges[%ld] .start=0x%08lx .end=0x%08lx .length=%ld .count=%ld\n", 539 i, ranges[i].start, ranges[i].end, ranges[i].length, ranges[i].count); 540 } 541 #endif 542 543 rangeIndex = 0; 544 return TRUE; 545 } 546 547 uint32_t 548 CollationWeights::nextWeight() { 549 if(rangeIndex >= rangeCount) { 550 return 0xffffffff; 551 } else { 552 /* get the next weight */ 553 WeightRange &range = ranges[rangeIndex]; 554 uint32_t weight = range.start; 555 if(--range.count == 0) { 556 /* this range is finished */ 557 ++rangeIndex; 558 } else { 559 /* increment the weight for the next value */ 560 range.start = incWeight(weight, range.length); 561 U_ASSERT(range.start <= range.end); 562 } 563 564 return weight; 565 } 566 } 567 568 U_NAMESPACE_END 569 570 #endif /* #if !UCONFIG_NO_COLLATION */ 571