1 /* 2 ******************************************************************************* 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationbasedatabuilder.cpp 7 * 8 * created on: 2012aug11 9 * created by: Markus W. Scherer 10 */ 11 12 #include "unicode/utypes.h" 13 14 #if !UCONFIG_NO_COLLATION 15 16 #include "unicode/localpointer.h" 17 #include "unicode/ucharstriebuilder.h" 18 #include "unicode/uniset.h" 19 #include "unicode/unistr.h" 20 #include "unicode/utf16.h" 21 #include "collation.h" 22 #include "collationbasedatabuilder.h" 23 #include "collationdata.h" 24 #include "collationdatabuilder.h" 25 #include "collationrootelements.h" 26 #include "normalizer2impl.h" 27 #include "uassert.h" 28 #include "utrie2.h" 29 #include "uvectr32.h" 30 #include "uvectr64.h" 31 #include "uvector.h" 32 33 U_NAMESPACE_BEGIN 34 35 namespace { 36 37 /** 38 * Compare two signed int64_t values as if they were unsigned. 39 */ 40 int32_t 41 compareInt64AsUnsigned(int64_t a, int64_t b) { 42 if((uint64_t)a < (uint64_t)b) { 43 return -1; 44 } else if((uint64_t)a > (uint64_t)b) { 45 return 1; 46 } else { 47 return 0; 48 } 49 } 50 51 // TODO: Try to merge this with the binarySearch in alphaindex.cpp. 52 /** 53 * Like Java Collections.binarySearch(List, String, Comparator). 54 * 55 * @return the index>=0 where the item was found, 56 * or the index<0 for inserting the string at ~index in sorted order 57 */ 58 int32_t 59 binarySearch(const UVector64 &list, int64_t ce) { 60 if (list.size() == 0) { return ~0; } 61 int32_t start = 0; 62 int32_t limit = list.size(); 63 for (;;) { 64 int32_t i = (start + limit) / 2; 65 int32_t cmp = compareInt64AsUnsigned(ce, list.elementAti(i)); 66 if (cmp == 0) { 67 return i; 68 } else if (cmp < 0) { 69 if (i == start) { 70 return ~start; // insert ce before i 71 } 72 limit = i; 73 } else { 74 if (i == start) { 75 return ~(start + 1); // insert ce after i 76 } 77 start = i; 78 } 79 } 80 } 81 82 } // namespace 83 84 CollationBaseDataBuilder::CollationBaseDataBuilder(UErrorCode &errorCode) 85 : CollationDataBuilder(errorCode), 86 numericPrimary(0x12000000), 87 firstHanPrimary(0), lastHanPrimary(0), hanStep(2), 88 rootElements(errorCode) { 89 } 90 91 CollationBaseDataBuilder::~CollationBaseDataBuilder() { 92 } 93 94 void 95 CollationBaseDataBuilder::init(UErrorCode &errorCode) { 96 if(U_FAILURE(errorCode)) { return; } 97 if(trie != NULL) { 98 errorCode = U_INVALID_STATE_ERROR; 99 return; 100 } 101 102 // Not compressible: 103 // - digits 104 // - Latin 105 // - Hani 106 // - trail weights 107 // Some scripts are compressible, some are not. 108 uprv_memset(compressibleBytes, FALSE, 256); 109 compressibleBytes[Collation::UNASSIGNED_IMPLICIT_BYTE] = TRUE; 110 111 // For a base, the default is to compute an unassigned-character implicit CE. 112 // This includes surrogate code points; see the last option in 113 // UCA section 7.1.1 Handling Ill-Formed Code Unit Sequences. 114 trie = utrie2_open(Collation::UNASSIGNED_CE32, Collation::FFFD_CE32, &errorCode); 115 116 // Preallocate trie blocks for Latin in the hope that proximity helps with CPU caches. 117 for(UChar32 c = 0; c < 0x180; ++c) { 118 utrie2_set32(trie, c, Collation::UNASSIGNED_CE32, &errorCode); 119 } 120 121 utrie2_set32(trie, 0xfffe, Collation::MERGE_SEPARATOR_CE32, &errorCode); 122 // No root element for the merge separator which has 02 weights. 123 // Some code assumes that the root first primary CE is the "space first primary" 124 // from FractionalUCA.txt. 125 126 uint32_t hangulCE32 = Collation::makeCE32FromTagAndIndex(Collation::HANGUL_TAG, 0); 127 utrie2_setRange32(trie, Hangul::HANGUL_BASE, Hangul::HANGUL_END, hangulCE32, TRUE, &errorCode); 128 129 // Add a mapping for the first-unassigned boundary, 130 // which is the AlphabeticIndex overflow boundary. 131 UnicodeString s((UChar)0xfdd1); // Script boundary contractions start with U+FDD1. 132 s.append((UChar)0xfdd0); // Zzzz script sample character U+FDD0. 133 int64_t ce = Collation::makeCE(Collation::FIRST_UNASSIGNED_PRIMARY); 134 add(UnicodeString(), s, &ce, 1, errorCode); 135 136 // Add a tailoring boundary, but not a mapping, for [first trailing]. 137 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 138 rootElements.addElement(ce, errorCode); 139 140 // U+FFFD maps to a CE with the third-highest primary weight, 141 // for predictable handling of ill-formed UTF-8. 142 uint32_t ce32 = Collation::FFFD_CE32; 143 utrie2_set32(trie, 0xfffd, ce32, &errorCode); 144 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); 145 146 // U+FFFF maps to a CE with the highest primary weight. 147 ce32 = Collation::MAX_REGULAR_CE32; 148 utrie2_set32(trie, 0xffff, ce32, &errorCode); 149 addRootElement(Collation::ceFromSimpleCE32(ce32), errorCode); 150 } 151 152 void 153 CollationBaseDataBuilder::initHanRanges(const UChar32 ranges[], int32_t length, 154 UErrorCode &errorCode) { 155 if(U_FAILURE(errorCode) || length == 0) { return; } 156 if((length & 1) != 0) { // incomplete start/end pairs 157 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 158 return; 159 } 160 if(isAssigned(0x4e00)) { // already set 161 errorCode = U_INVALID_STATE_ERROR; 162 return; 163 } 164 int32_t numHanCodePoints = 0; 165 for(int32_t i = 0; i < length; i += 2) { 166 UChar32 start = ranges[i]; 167 UChar32 end = ranges[i + 1]; 168 numHanCodePoints += end - start + 1; 169 } 170 // Multiply the number of code points by (gap+1). 171 // Add hanStep+2 for tailoring after the last Han character. 172 int32_t gap = 1; 173 hanStep = gap + 1; 174 int32_t numHan = numHanCodePoints * hanStep + hanStep + 2; 175 // Numbers of Han primaries per lead byte determined by 176 // numbers of 2nd (not compressible) times 3rd primary byte values. 177 int32_t numHanPerLeadByte = 254 * 254; 178 int32_t numHanLeadBytes = (numHan + numHanPerLeadByte - 1) / numHanPerLeadByte; 179 uint32_t hanPrimary = (uint32_t)(Collation::UNASSIGNED_IMPLICIT_BYTE - numHanLeadBytes) << 24; 180 hanPrimary |= 0x20200; 181 firstHanPrimary = hanPrimary; 182 for(int32_t i = 0; i < length; i += 2) { 183 UChar32 start = ranges[i]; 184 UChar32 end = ranges[i + 1]; 185 hanPrimary = setPrimaryRangeAndReturnNext(start, end, hanPrimary, hanStep, errorCode); 186 } 187 // One past the actual last one, but that is harmless for tailoring. 188 // It saves us from subtracting "hanStep" and handling underflows. 189 lastHanPrimary = hanPrimary; 190 } 191 192 UBool 193 CollationBaseDataBuilder::isCompressibleLeadByte(uint32_t b) const { 194 return compressibleBytes[b]; 195 } 196 197 void 198 CollationBaseDataBuilder::setCompressibleLeadByte(uint32_t b) { 199 compressibleBytes[b] = TRUE; 200 } 201 202 int32_t 203 CollationBaseDataBuilder::diffTwoBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) { 204 if((p1 & 0xff000000) == (p2 & 0xff000000)) { 205 // Same lead bytes. 206 return (int32_t)(p2 - p1) >> 16; 207 } else { 208 int32_t linear1; 209 int32_t linear2; 210 int32_t factor; 211 if(isCompressible) { 212 // Second byte for compressible lead byte: 251 bytes 04..FE 213 linear1 = (int32_t)((p1 >> 16) & 0xff) - 4; 214 linear2 = (int32_t)((p2 >> 16) & 0xff) - 4; 215 factor = 251; 216 } else { 217 // Second byte for incompressible lead byte: 254 bytes 02..FF 218 linear1 = (int32_t)((p1 >> 16) & 0xff) - 2; 219 linear2 = (int32_t)((p2 >> 16) & 0xff) - 2; 220 factor = 254; 221 } 222 linear1 += factor * (int32_t)((p1 >> 24) & 0xff); 223 linear2 += factor * (int32_t)((p2 >> 24) & 0xff); 224 return linear2 - linear1; 225 } 226 } 227 228 int32_t 229 CollationBaseDataBuilder::diffThreeBytePrimaries(uint32_t p1, uint32_t p2, UBool isCompressible) { 230 if((p1 & 0xffff0000) == (p2 & 0xffff0000)) { 231 // Same first two bytes. 232 return (int32_t)(p2 - p1) >> 8; 233 } else { 234 // Third byte: 254 bytes 02..FF 235 int32_t linear1 = (int32_t)((p1 >> 8) & 0xff) - 2; 236 int32_t linear2 = (int32_t)((p2 >> 8) & 0xff) - 2; 237 int32_t factor; 238 if(isCompressible) { 239 // Second byte for compressible lead byte: 251 bytes 04..FE 240 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 4); 241 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 4); 242 factor = 251 * 254; 243 } else { 244 // Second byte for incompressible lead byte: 254 bytes 02..FF 245 linear1 += 254 * ((int32_t)((p1 >> 16) & 0xff) - 2); 246 linear2 += 254 * ((int32_t)((p2 >> 16) & 0xff) - 2); 247 factor = 254 * 254; 248 } 249 linear1 += factor * (int32_t)((p1 >> 24) & 0xff); 250 linear2 += factor * (int32_t)((p2 >> 24) & 0xff); 251 return linear2 - linear1; 252 } 253 } 254 255 uint32_t 256 CollationBaseDataBuilder::encodeCEs(const int64_t ces[], int32_t cesLength, UErrorCode &errorCode) { 257 addRootElements(ces, cesLength, errorCode); 258 return CollationDataBuilder::encodeCEs(ces, cesLength, errorCode); 259 } 260 261 void 262 CollationBaseDataBuilder::addRootElements(const int64_t ces[], int32_t cesLength, 263 UErrorCode &errorCode) { 264 if(U_FAILURE(errorCode)) { return; } 265 for(int32_t i = 0; i < cesLength; ++i) { 266 addRootElement(ces[i], errorCode); 267 } 268 } 269 270 void 271 CollationBaseDataBuilder::addRootElement(int64_t ce, UErrorCode &errorCode) { 272 if(U_FAILURE(errorCode) || ce == 0) { return; } 273 // Remove case bits. 274 ce &= INT64_C(0xffffffffffff3fff); 275 U_ASSERT((ce & 0xc0) == 0); // quaternary==0 276 // Ignore the CE if it has a Han primary weight and common secondary/tertiary weights. 277 // We will add it later, as part of the Han ranges. 278 uint32_t p = (uint32_t)(ce >> 32); 279 uint32_t secTer = (uint32_t)ce; 280 if(secTer == Collation::COMMON_SEC_AND_TER_CE) { 281 if(firstHanPrimary <= p && p <= lastHanPrimary) { 282 return; 283 } 284 } else { 285 // Check that secondary and tertiary weights are >= "common". 286 uint32_t s = secTer >> 16; 287 uint32_t t = secTer & Collation::ONLY_TERTIARY_MASK; 288 if((s != 0 && s < Collation::COMMON_WEIGHT16) || (t != 0 && t < Collation::COMMON_WEIGHT16)) { 289 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 290 return; 291 } 292 } 293 // Check that primaries have at most 3 bytes. 294 if((p & 0xff) != 0) { 295 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 296 return; 297 } 298 int32_t i = binarySearch(rootElements, ce); 299 if(i < 0) { 300 rootElements.insertElementAt(ce, ~i, errorCode); 301 } 302 } 303 304 void 305 CollationBaseDataBuilder::addReorderingGroup(uint32_t firstByte, uint32_t lastByte, 306 const UnicodeString &groupScripts, 307 UErrorCode &errorCode) { 308 if(U_FAILURE(errorCode)) { return; } 309 if(groupScripts.isEmpty()) { 310 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 311 return; 312 } 313 if(groupScripts.indexOf((UChar)USCRIPT_UNKNOWN) >= 0) { 314 // Zzzz must not occur. 315 // It is the code used in the API to separate low and high scripts. 316 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 317 return; 318 } 319 // Note: We are mostly trusting the input data, 320 // rather than verifying that reordering groups do not intersect 321 // with their lead byte ranges nor their sets of scripts, 322 // and that all script codes are valid. 323 scripts.append((UChar)((firstByte << 8) | lastByte)); 324 scripts.append((UChar)groupScripts.length()); 325 scripts.append(groupScripts); 326 } 327 328 void 329 CollationBaseDataBuilder::build(CollationData &data, UErrorCode &errorCode) { 330 buildMappings(data, errorCode); 331 data.numericPrimary = numericPrimary; 332 data.compressibleBytes = compressibleBytes; 333 data.scripts = reinterpret_cast<const uint16_t *>(scripts.getBuffer()); 334 data.scriptsLength = scripts.length(); 335 buildFastLatinTable(data, errorCode); 336 } 337 338 void 339 CollationBaseDataBuilder::buildRootElementsTable(UVector32 &table, UErrorCode &errorCode) { 340 if(U_FAILURE(errorCode)) { return; } 341 uint32_t nextHanPrimary = firstHanPrimary; // Set to 0xffffffff after the last Han range. 342 uint32_t prevPrimary = 0; // Start with primary ignorable CEs. 343 UBool tryRange = FALSE; 344 for(int32_t i = 0; i < rootElements.size(); ++i) { 345 int64_t ce = rootElements.elementAti(i); 346 uint32_t p = (uint32_t)(ce >> 32); 347 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 348 if(p != prevPrimary) { 349 U_ASSERT((p & 0xff) == 0); 350 int32_t end; 351 if(p >= nextHanPrimary) { 352 // Add a Han primary weight or range. 353 // We omitted them initially, and omitted all CEs with Han primaries 354 // and common secondary/tertiary weights. 355 U_ASSERT(p > lastHanPrimary || secTer != Collation::COMMON_SEC_AND_TER_CE); 356 if(p == nextHanPrimary) { 357 // One single Han primary with non-common secondary/tertiary weights. 358 table.addElement((int32_t)p, errorCode); 359 if(p < lastHanPrimary) { 360 // Prepare for the next Han range. 361 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep); 362 } else { 363 // p is the last Han primary. 364 nextHanPrimary = 0xffffffff; 365 } 366 } else { 367 // p > nextHanPrimary: Add a Han primary range, starting with nextHanPrimary. 368 table.addElement((int32_t)nextHanPrimary, errorCode); 369 if(nextHanPrimary == lastHanPrimary) { 370 // nextHanPrimary == lastHanPrimary < p 371 // We just wrote the single last Han primary. 372 nextHanPrimary = 0xffffffff; 373 } else if(p < lastHanPrimary) { 374 // nextHanPrimary < p < lastHanPrimary 375 // End the Han range on p, prepare for the next range. 376 table.addElement((int32_t)p | hanStep, errorCode); 377 nextHanPrimary = Collation::incThreeBytePrimaryByOffset(p, FALSE, hanStep); 378 } else if(p == lastHanPrimary) { 379 // nextHanPrimary < p == lastHanPrimary 380 // End the last Han range on p. 381 table.addElement((int32_t)p | hanStep, errorCode); 382 nextHanPrimary = 0xffffffff; 383 } else { 384 // nextHanPrimary < lastHanPrimary < p 385 // End the last Han range, then write p. 386 table.addElement((int32_t)lastHanPrimary | hanStep, errorCode); 387 nextHanPrimary = 0xffffffff; 388 table.addElement((int32_t)p, errorCode); 389 } 390 } 391 } else if(tryRange && secTer == Collation::COMMON_SEC_AND_TER_CE && 392 (end = writeRootElementsRange(prevPrimary, p, i + 1, table, errorCode)) != 0) { 393 // Multiple CEs with only common secondary/tertiary weights were 394 // combined into a primary range. 395 // The range end was written, ending with the primary of rootElements[end]. 396 ce = rootElements.elementAti(end); 397 p = (uint32_t)(ce >> 32); 398 secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 399 i = end; 400 } else { 401 // Write the primary weight of a normal CE. 402 table.addElement((int32_t)p, errorCode); 403 } 404 prevPrimary = p; 405 } 406 if(secTer == Collation::COMMON_SEC_AND_TER_CE) { 407 // The common secondar/tertiary weights are implied in the primary unit. 408 // If there is no intervening delta unit, then we will try to combine 409 // the next several primaries into a range. 410 tryRange = TRUE; 411 } else { 412 // For each new set of secondary/tertiary weights we write a delta unit. 413 table.addElement((int32_t)secTer | CollationRootElements::SEC_TER_DELTA_FLAG, errorCode); 414 tryRange = FALSE; 415 } 416 } 417 418 // Limit sentinel for root elements. 419 // This allows us to reduce range checks at runtime. 420 table.addElement(CollationRootElements::PRIMARY_SENTINEL, errorCode); 421 } 422 423 int32_t 424 CollationBaseDataBuilder::writeRootElementsRange( 425 uint32_t prevPrimary, uint32_t p, int32_t i, 426 UVector32 &table, UErrorCode &errorCode) { 427 if(U_FAILURE(errorCode) || i >= rootElements.size()) { return 0; } 428 U_ASSERT(prevPrimary < p); 429 // No ranges of single-byte primaries. 430 if((p & prevPrimary & 0xff0000) == 0) { return 0; } 431 // Lead bytes of compressible primaries must match. 432 UBool isCompressible = isCompressiblePrimary(p); 433 if((isCompressible || isCompressiblePrimary(prevPrimary)) && 434 (p & 0xff000000) != (prevPrimary & 0xff000000)) { 435 return 0; 436 } 437 // Number of bytes in the primaries. 438 UBool twoBytes; 439 // Number of primaries from prevPrimary to p. 440 int32_t step; 441 if((p & 0xff00) == 0) { 442 // 2-byte primary 443 if((prevPrimary & 0xff00) != 0) { return 0; } // length mismatch 444 twoBytes = TRUE; 445 step = diffTwoBytePrimaries(prevPrimary, p, isCompressible); 446 } else { 447 // 3-byte primary 448 if((prevPrimary & 0xff00) == 0) { return 0; } // length mismatch 449 twoBytes = FALSE; 450 step = diffThreeBytePrimaries(prevPrimary, p, isCompressible); 451 } 452 if(step > (int32_t)CollationRootElements::PRIMARY_STEP_MASK) { return 0; } 453 // See if there are more than two CEs with primaries increasing by "step" 454 // and with only common secondary/tertiary weights on all but the last one. 455 int32_t end = 0; // Initially 0: No range for just two primaries. 456 for(;;) { 457 prevPrimary = p; 458 // Calculate which primary we expect next. 459 uint32_t nextPrimary; // = p + step 460 if(twoBytes) { 461 nextPrimary = Collation::incTwoBytePrimaryByOffset(p, isCompressible, step); 462 } else { 463 nextPrimary = Collation::incThreeBytePrimaryByOffset(p, isCompressible, step); 464 } 465 // Fetch the actual next CE. 466 int64_t ce = rootElements.elementAti(i); 467 p = (uint32_t)(ce >> 32); 468 uint32_t secTer = (uint32_t)ce & Collation::ONLY_SEC_TER_MASK; 469 // Does this primary increase by "step" from the last one? 470 if(p != nextPrimary || 471 // Do not cross into a new lead byte if either is compressible. 472 ((p & 0xff000000) != (prevPrimary & 0xff000000) && 473 (isCompressible || isCompressiblePrimary(p)))) { 474 // The range ends with the previous CE. 475 p = prevPrimary; 476 break; 477 } 478 // Extend the range to include this primary. 479 end = i++; 480 // This primary is the last in the range if it has non-common weights 481 // or if we are at the end of the list. 482 if(secTer != Collation::COMMON_SEC_AND_TER_CE || i >= rootElements.size()) { break; } 483 } 484 if(end != 0) { 485 table.addElement((int32_t)p | step, errorCode); 486 } 487 return end; 488 } 489 490 U_NAMESPACE_END 491 492 #endif // !UCONFIG_NO_COLLATION 493