1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationbuilder.cpp 9 * 10 * (replaced the former ucol_bld.cpp) 11 * 12 * created on: 2013may06 13 * created by: Markus W. Scherer 14 */ 15 16 #ifdef DEBUG_COLLATION_BUILDER 17 #include <stdio.h> 18 #endif 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_COLLATION 23 24 #include "unicode/caniter.h" 25 #include "unicode/normalizer2.h" 26 #include "unicode/tblcoll.h" 27 #include "unicode/parseerr.h" 28 #include "unicode/uchar.h" 29 #include "unicode/ucol.h" 30 #include "unicode/unistr.h" 31 #include "unicode/usetiter.h" 32 #include "unicode/utf16.h" 33 #include "unicode/uversion.h" 34 #include "cmemory.h" 35 #include "collation.h" 36 #include "collationbuilder.h" 37 #include "collationdata.h" 38 #include "collationdatabuilder.h" 39 #include "collationfastlatin.h" 40 #include "collationroot.h" 41 #include "collationrootelements.h" 42 #include "collationruleparser.h" 43 #include "collationsettings.h" 44 #include "collationtailoring.h" 45 #include "collationweights.h" 46 #include "normalizer2impl.h" 47 #include "uassert.h" 48 #include "ucol_imp.h" 49 #include "utf16collationiterator.h" 50 51 U_NAMESPACE_BEGIN 52 53 namespace { 54 55 class BundleImporter : public CollationRuleParser::Importer { 56 public: 57 BundleImporter() {} 58 virtual ~BundleImporter(); 59 virtual void getRules( 60 const char *localeID, const char *collationType, 61 UnicodeString &rules, 62 const char *&errorReason, UErrorCode &errorCode); 63 }; 64 65 BundleImporter::~BundleImporter() {} 66 67 void 68 BundleImporter::getRules( 69 const char *localeID, const char *collationType, 70 UnicodeString &rules, 71 const char *& /*errorReason*/, UErrorCode &errorCode) { 72 CollationLoader::loadRules(localeID, collationType, rules, errorCode); 73 } 74 75 } // namespace 76 77 // RuleBasedCollator implementation ---------------------------------------- *** 78 79 // These methods are here, rather than in rulebasedcollator.cpp, 80 // for modularization: 81 // Most code using Collator does not need to build a Collator from rules. 82 // By moving these constructors and helper methods to a separate file, 83 // most code will not have a static dependency on the builder code. 84 85 RuleBasedCollator::RuleBasedCollator() 86 : data(NULL), 87 settings(NULL), 88 tailoring(NULL), 89 cacheEntry(NULL), 90 validLocale(""), 91 explicitlySetAttributes(0), 92 actualLocaleIsSameAsValid(FALSE) { 93 } 94 95 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) 96 : data(NULL), 97 settings(NULL), 98 tailoring(NULL), 99 cacheEntry(NULL), 100 validLocale(""), 101 explicitlySetAttributes(0), 102 actualLocaleIsSameAsValid(FALSE) { 103 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); 104 } 105 106 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, 107 UErrorCode &errorCode) 108 : data(NULL), 109 settings(NULL), 110 tailoring(NULL), 111 cacheEntry(NULL), 112 validLocale(""), 113 explicitlySetAttributes(0), 114 actualLocaleIsSameAsValid(FALSE) { 115 internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); 116 } 117 118 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 119 UColAttributeValue decompositionMode, 120 UErrorCode &errorCode) 121 : data(NULL), 122 settings(NULL), 123 tailoring(NULL), 124 cacheEntry(NULL), 125 validLocale(""), 126 explicitlySetAttributes(0), 127 actualLocaleIsSameAsValid(FALSE) { 128 internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); 129 } 130 131 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 132 ECollationStrength strength, 133 UColAttributeValue decompositionMode, 134 UErrorCode &errorCode) 135 : data(NULL), 136 settings(NULL), 137 tailoring(NULL), 138 cacheEntry(NULL), 139 validLocale(""), 140 explicitlySetAttributes(0), 141 actualLocaleIsSameAsValid(FALSE) { 142 internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); 143 } 144 145 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 146 UParseError &parseError, UnicodeString &reason, 147 UErrorCode &errorCode) 148 : data(NULL), 149 settings(NULL), 150 tailoring(NULL), 151 cacheEntry(NULL), 152 validLocale(""), 153 explicitlySetAttributes(0), 154 actualLocaleIsSameAsValid(FALSE) { 155 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); 156 } 157 158 void 159 RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, 160 int32_t strength, 161 UColAttributeValue decompositionMode, 162 UParseError *outParseError, UnicodeString *outReason, 163 UErrorCode &errorCode) { 164 const CollationTailoring *base = CollationRoot::getRoot(errorCode); 165 if(U_FAILURE(errorCode)) { return; } 166 if(outReason != NULL) { outReason->remove(); } 167 CollationBuilder builder(base, errorCode); 168 UVersionInfo noVersion = { 0, 0, 0, 0 }; 169 BundleImporter importer; 170 LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, 171 &importer, 172 outParseError, errorCode)); 173 if(U_FAILURE(errorCode)) { 174 const char *reason = builder.getErrorReason(); 175 if(reason != NULL && outReason != NULL) { 176 *outReason = UnicodeString(reason, -1, US_INV); 177 } 178 return; 179 } 180 t->actualLocale.setToBogus(); 181 adoptTailoring(t.orphan(), errorCode); 182 // Set attributes after building the collator, 183 // to keep the default settings consistent with the rule string. 184 if(strength != UCOL_DEFAULT) { 185 setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); 186 } 187 if(decompositionMode != UCOL_DEFAULT) { 188 setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); 189 } 190 } 191 192 // CollationBuilder implementation ----------------------------------------- *** 193 194 // Some compilers don't care if constants are defined in the .cpp file. 195 // MS Visual C++ does not like it, but gcc requires it. clang does not care. 196 #ifndef _MSC_VER 197 const int32_t CollationBuilder::HAS_BEFORE2; 198 const int32_t CollationBuilder::HAS_BEFORE3; 199 #endif 200 201 CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) 202 : nfd(*Normalizer2::getNFDInstance(errorCode)), 203 fcd(*Normalizer2Factory::getFCDInstance(errorCode)), 204 nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 205 base(b), 206 baseData(b->data), 207 rootElements(b->data->rootElements, b->data->rootElementsLength), 208 variableTop(0), 209 dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), 210 errorReason(NULL), 211 cesLength(0), 212 rootPrimaryIndexes(errorCode), nodes(errorCode) { 213 nfcImpl.ensureCanonIterData(errorCode); 214 if(U_FAILURE(errorCode)) { 215 errorReason = "CollationBuilder fields initialization failed"; 216 return; 217 } 218 if(dataBuilder == NULL) { 219 errorCode = U_MEMORY_ALLOCATION_ERROR; 220 return; 221 } 222 dataBuilder->initForTailoring(baseData, errorCode); 223 if(U_FAILURE(errorCode)) { 224 errorReason = "CollationBuilder initialization failed"; 225 } 226 } 227 228 CollationBuilder::~CollationBuilder() { 229 delete dataBuilder; 230 } 231 232 CollationTailoring * 233 CollationBuilder::parseAndBuild(const UnicodeString &ruleString, 234 const UVersionInfo rulesVersion, 235 CollationRuleParser::Importer *importer, 236 UParseError *outParseError, 237 UErrorCode &errorCode) { 238 if(U_FAILURE(errorCode)) { return NULL; } 239 if(baseData->rootElements == NULL) { 240 errorCode = U_MISSING_RESOURCE_ERROR; 241 errorReason = "missing root elements data, tailoring not supported"; 242 return NULL; 243 } 244 LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); 245 if(tailoring.isNull() || tailoring->isBogus()) { 246 errorCode = U_MEMORY_ALLOCATION_ERROR; 247 return NULL; 248 } 249 CollationRuleParser parser(baseData, errorCode); 250 if(U_FAILURE(errorCode)) { return NULL; } 251 // Note: This always bases &[last variable] and &[first regular] 252 // on the root collator's maxVariable/variableTop. 253 // If we wanted this to change after [maxVariable x], then we would keep 254 // the tailoring.settings pointer here and read its variableTop when we need it. 255 // See http://unicode.org/cldr/trac/ticket/6070 256 variableTop = base->settings->variableTop; 257 parser.setSink(this); 258 parser.setImporter(importer); 259 CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings); 260 parser.parse(ruleString, ownedSettings, outParseError, errorCode); 261 errorReason = parser.getErrorReason(); 262 if(U_FAILURE(errorCode)) { return NULL; } 263 if(dataBuilder->hasMappings()) { 264 makeTailoredCEs(errorCode); 265 closeOverComposites(errorCode); 266 finalizeCEs(errorCode); 267 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 268 optimizeSet.add(0, 0x7f); 269 optimizeSet.add(0xc0, 0xff); 270 // Hangul is decomposed on the fly during collation, 271 // and the tailoring data is always built with HANGUL_TAG specials. 272 optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 273 dataBuilder->optimize(optimizeSet, errorCode); 274 tailoring->ensureOwnedData(errorCode); 275 if(U_FAILURE(errorCode)) { return NULL; } 276 if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } 277 dataBuilder->build(*tailoring->ownedData, errorCode); 278 tailoring->builder = dataBuilder; 279 dataBuilder = NULL; 280 } else { 281 tailoring->data = baseData; 282 } 283 if(U_FAILURE(errorCode)) { return NULL; } 284 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 285 tailoring->data, ownedSettings, 286 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries)); 287 tailoring->rules = ruleString; 288 tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination 289 tailoring->setVersion(base->version, rulesVersion); 290 return tailoring.orphan(); 291 } 292 293 void 294 CollationBuilder::addReset(int32_t strength, const UnicodeString &str, 295 const char *&parserErrorReason, UErrorCode &errorCode) { 296 if(U_FAILURE(errorCode)) { return; } 297 U_ASSERT(!str.isEmpty()); 298 if(str.charAt(0) == CollationRuleParser::POS_LEAD) { 299 ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); 300 cesLength = 1; 301 if(U_FAILURE(errorCode)) { return; } 302 U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); 303 } else { 304 // normal reset to a character or string 305 UnicodeString nfdString = nfd.normalize(str, errorCode); 306 if(U_FAILURE(errorCode)) { 307 parserErrorReason = "normalizing the reset position"; 308 return; 309 } 310 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 311 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 312 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 313 parserErrorReason = "reset position maps to too many collation elements (more than 31)"; 314 return; 315 } 316 } 317 if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position 318 319 // &[before strength]position 320 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); 321 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 322 if(U_FAILURE(errorCode)) { return; } 323 324 int64_t node = nodes.elementAti(index); 325 // If the index is for a "weaker" node, 326 // then skip backwards over this and further "weaker" nodes. 327 while(strengthFromNode(node) > strength) { 328 index = previousIndexFromNode(node); 329 node = nodes.elementAti(index); 330 } 331 332 // Find or insert a node whose index we will put into a temporary CE. 333 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 334 // Reset to just before this same-strength tailored node. 335 index = previousIndexFromNode(node); 336 } else if(strength == UCOL_PRIMARY) { 337 // root primary node (has no previous index) 338 uint32_t p = weight32FromNode(node); 339 if(p == 0) { 340 errorCode = U_UNSUPPORTED_ERROR; 341 parserErrorReason = "reset primary-before ignorable not possible"; 342 return; 343 } 344 if(p <= rootElements.getFirstPrimary()) { 345 // There is no primary gap between ignorables and the space-first-primary. 346 errorCode = U_UNSUPPORTED_ERROR; 347 parserErrorReason = "reset primary-before first non-ignorable not supported"; 348 return; 349 } 350 if(p == Collation::FIRST_TRAILING_PRIMARY) { 351 // We do not support tailoring to an unassigned-implicit CE. 352 errorCode = U_UNSUPPORTED_ERROR; 353 parserErrorReason = "reset primary-before [first trailing] not supported"; 354 return; 355 } 356 p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); 357 index = findOrInsertNodeForPrimary(p, errorCode); 358 // Go to the last node in this list: 359 // Tailor after the last node between adjacent root nodes. 360 for(;;) { 361 node = nodes.elementAti(index); 362 int32_t nextIndex = nextIndexFromNode(node); 363 if(nextIndex == 0) { break; } 364 index = nextIndex; 365 } 366 } else { 367 // &[before 2] or &[before 3] 368 index = findCommonNode(index, UCOL_SECONDARY); 369 if(strength >= UCOL_TERTIARY) { 370 index = findCommonNode(index, UCOL_TERTIARY); 371 } 372 // findCommonNode() stayed on the stronger node or moved to 373 // an explicit common-weight node of the reset-before strength. 374 node = nodes.elementAti(index); 375 if(strengthFromNode(node) == strength) { 376 // Found a same-strength node with an explicit weight. 377 uint32_t weight16 = weight16FromNode(node); 378 if(weight16 == 0) { 379 errorCode = U_UNSUPPORTED_ERROR; 380 if(strength == UCOL_SECONDARY) { 381 parserErrorReason = "reset secondary-before secondary ignorable not possible"; 382 } else { 383 parserErrorReason = "reset tertiary-before completely ignorable not possible"; 384 } 385 return; 386 } 387 U_ASSERT(weight16 > Collation::BEFORE_WEIGHT16); 388 // Reset to just before this node. 389 // Insert the preceding same-level explicit weight if it is not there already. 390 // Which explicit weight immediately precedes this one? 391 weight16 = getWeight16Before(index, node, strength); 392 // Does this preceding weight have a node? 393 uint32_t previousWeight16; 394 int32_t previousIndex = previousIndexFromNode(node); 395 for(int32_t i = previousIndex;; i = previousIndexFromNode(node)) { 396 node = nodes.elementAti(i); 397 int32_t previousStrength = strengthFromNode(node); 398 if(previousStrength < strength) { 399 U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16 || i == previousIndex); 400 // Either the reset element has an above-common weight and 401 // the parent node provides the implied common weight, 402 // or the reset element has a weight<=common in the node 403 // right after the parent, and we need to insert the preceding weight. 404 previousWeight16 = Collation::COMMON_WEIGHT16; 405 break; 406 } else if(previousStrength == strength && !isTailoredNode(node)) { 407 previousWeight16 = weight16FromNode(node); 408 break; 409 } 410 // Skip weaker nodes and same-level tailored nodes. 411 } 412 if(previousWeight16 == weight16) { 413 // The preceding weight has a node, 414 // maybe with following weaker or tailored nodes. 415 // Reset to the last of them. 416 index = previousIndex; 417 } else { 418 // Insert a node with the preceding weight, reset to that. 419 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 420 index = insertNodeBetween(previousIndex, index, node, errorCode); 421 } 422 } else { 423 // Found a stronger node with implied strength-common weight. 424 uint32_t weight16 = getWeight16Before(index, node, strength); 425 index = findOrInsertWeakNode(index, weight16, strength, errorCode); 426 } 427 // Strength of the temporary CE = strength of its reset position. 428 // Code above raises an error if the before-strength is stronger. 429 strength = ceStrength(ces[cesLength - 1]); 430 } 431 if(U_FAILURE(errorCode)) { 432 parserErrorReason = "inserting reset position for &[before n]"; 433 return; 434 } 435 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 436 } 437 438 uint32_t 439 CollationBuilder::getWeight16Before(int32_t index, int64_t node, int32_t level) { 440 U_ASSERT(strengthFromNode(node) < level || !isTailoredNode(node)); 441 // Collect the root CE weights if this node is for a root CE. 442 // If it is not, then return the low non-primary boundary for a tailored CE. 443 uint32_t t; 444 if(strengthFromNode(node) == UCOL_TERTIARY) { 445 t = weight16FromNode(node); 446 } else { 447 t = Collation::COMMON_WEIGHT16; // Stronger node with implied common weight. 448 } 449 while(strengthFromNode(node) > UCOL_SECONDARY) { 450 index = previousIndexFromNode(node); 451 node = nodes.elementAti(index); 452 } 453 if(isTailoredNode(node)) { 454 return Collation::BEFORE_WEIGHT16; 455 } 456 uint32_t s; 457 if(strengthFromNode(node) == UCOL_SECONDARY) { 458 s = weight16FromNode(node); 459 } else { 460 s = Collation::COMMON_WEIGHT16; // Stronger node with implied common weight. 461 } 462 while(strengthFromNode(node) > UCOL_PRIMARY) { 463 index = previousIndexFromNode(node); 464 node = nodes.elementAti(index); 465 } 466 if(isTailoredNode(node)) { 467 return Collation::BEFORE_WEIGHT16; 468 } 469 // [p, s, t] is a root CE. Return the preceding weight for the requested level. 470 uint32_t p = weight32FromNode(node); 471 uint32_t weight16; 472 if(level == UCOL_SECONDARY) { 473 weight16 = rootElements.getSecondaryBefore(p, s); 474 } else { 475 weight16 = rootElements.getTertiaryBefore(p, s, t); 476 U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); 477 } 478 return weight16; 479 } 480 481 int64_t 482 CollationBuilder::getSpecialResetPosition(const UnicodeString &str, 483 const char *&parserErrorReason, UErrorCode &errorCode) { 484 U_ASSERT(str.length() == 2); 485 int64_t ce; 486 int32_t strength = UCOL_PRIMARY; 487 UBool isBoundary = FALSE; 488 UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; 489 U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); 490 switch(pos) { 491 case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: 492 // Quaternary CEs are not supported. 493 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 494 return 0; 495 case CollationRuleParser::LAST_TERTIARY_IGNORABLE: 496 return 0; 497 case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { 498 // Look for a tailored tertiary node after [0, 0, 0]. 499 int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); 500 if(U_FAILURE(errorCode)) { return 0; } 501 int64_t node = nodes.elementAti(index); 502 if((index = nextIndexFromNode(node)) != 0) { 503 node = nodes.elementAti(index); 504 U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); 505 if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { 506 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); 507 } 508 } 509 return rootElements.getFirstTertiaryCE(); 510 // No need to look for nodeHasAnyBefore() on a tertiary node. 511 } 512 case CollationRuleParser::LAST_SECONDARY_IGNORABLE: 513 ce = rootElements.getLastTertiaryCE(); 514 strength = UCOL_TERTIARY; 515 break; 516 case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { 517 // Look for a tailored secondary node after [0, 0, *]. 518 int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); 519 if(U_FAILURE(errorCode)) { return 0; } 520 int64_t node = nodes.elementAti(index); 521 while((index = nextIndexFromNode(node)) != 0) { 522 node = nodes.elementAti(index); 523 strength = strengthFromNode(node); 524 if(strength < UCOL_SECONDARY) { break; } 525 if(strength == UCOL_SECONDARY) { 526 if(isTailoredNode(node)) { 527 if(nodeHasBefore3(node)) { 528 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 529 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 530 } 531 return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); 532 } else { 533 break; 534 } 535 } 536 } 537 ce = rootElements.getFirstSecondaryCE(); 538 strength = UCOL_SECONDARY; 539 break; 540 } 541 case CollationRuleParser::LAST_PRIMARY_IGNORABLE: 542 ce = rootElements.getLastSecondaryCE(); 543 strength = UCOL_SECONDARY; 544 break; 545 case CollationRuleParser::FIRST_VARIABLE: 546 ce = rootElements.getFirstPrimaryCE(); 547 isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 548 break; 549 case CollationRuleParser::LAST_VARIABLE: 550 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 551 break; 552 case CollationRuleParser::FIRST_REGULAR: 553 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 554 isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 555 break; 556 case CollationRuleParser::LAST_REGULAR: 557 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 558 // for backward compatibility with behavior before the introduction of 559 // script-first-primary CEs in the root collator. 560 ce = rootElements.firstCEWithPrimaryAtLeast( 561 baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); 562 break; 563 case CollationRuleParser::FIRST_IMPLICIT: 564 ce = baseData->getSingleCE(0x4e00, errorCode); 565 break; 566 case CollationRuleParser::LAST_IMPLICIT: 567 // We do not support tailoring to an unassigned-implicit CE. 568 errorCode = U_UNSUPPORTED_ERROR; 569 parserErrorReason = "reset to [last implicit] not supported"; 570 return 0; 571 case CollationRuleParser::FIRST_TRAILING: 572 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 573 isBoundary = TRUE; // trailing first primary (there is no mapping for it) 574 break; 575 case CollationRuleParser::LAST_TRAILING: 576 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 577 parserErrorReason = "LDML forbids tailoring to U+FFFF"; 578 return 0; 579 default: 580 U_ASSERT(FALSE); 581 return 0; 582 } 583 584 int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); 585 if(U_FAILURE(errorCode)) { return 0; } 586 int64_t node = nodes.elementAti(index); 587 if((pos & 1) == 0) { 588 // even pos = [first xyz] 589 if(!nodeHasAnyBefore(node) && isBoundary) { 590 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 591 // It is reachable via its special contraction, but is not normally used. 592 // Find the first character tailored after the boundary CE, 593 // or the first real root CE after it. 594 if((index = nextIndexFromNode(node)) != 0) { 595 // If there is a following node, then it must be tailored 596 // because there are no root CEs with a boundary primary 597 // and non-common secondary/tertiary weights. 598 node = nodes.elementAti(index); 599 U_ASSERT(isTailoredNode(node)); 600 ce = tempCEFromIndexAndStrength(index, strength); 601 } else { 602 U_ASSERT(strength == UCOL_PRIMARY); 603 uint32_t p = (uint32_t)(ce >> 32); 604 int32_t pIndex = rootElements.findPrimary(p); 605 UBool isCompressible = baseData->isCompressiblePrimary(p); 606 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 607 ce = Collation::makeCE(p); 608 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); 609 if(U_FAILURE(errorCode)) { return 0; } 610 node = nodes.elementAti(index); 611 } 612 } 613 if(nodeHasAnyBefore(node)) { 614 // Get the first node that was tailored before this one at a weaker strength. 615 if(nodeHasBefore2(node)) { 616 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 617 node = nodes.elementAti(index); 618 } 619 if(nodeHasBefore3(node)) { 620 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 621 } 622 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 623 ce = tempCEFromIndexAndStrength(index, strength); 624 } 625 } else { 626 // odd pos = [last xyz] 627 // Find the last node that was tailored after the [last xyz] 628 // at a strength no greater than the position's strength. 629 for(;;) { 630 int32_t nextIndex = nextIndexFromNode(node); 631 if(nextIndex == 0) { break; } 632 int64_t nextNode = nodes.elementAti(nextIndex); 633 if(strengthFromNode(nextNode) < strength) { break; } 634 index = nextIndex; 635 node = nextNode; 636 } 637 // Do not make a temporary CE for a root node. 638 // This last node might be the node for the root CE itself, 639 // or a node with a common secondary or tertiary weight. 640 if(isTailoredNode(node)) { 641 ce = tempCEFromIndexAndStrength(index, strength); 642 } 643 } 644 return ce; 645 } 646 647 void 648 CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, 649 const UnicodeString &str, const UnicodeString &extension, 650 const char *&parserErrorReason, UErrorCode &errorCode) { 651 if(U_FAILURE(errorCode)) { return; } 652 UnicodeString nfdPrefix; 653 if(!prefix.isEmpty()) { 654 nfd.normalize(prefix, nfdPrefix, errorCode); 655 if(U_FAILURE(errorCode)) { 656 parserErrorReason = "normalizing the relation prefix"; 657 return; 658 } 659 } 660 UnicodeString nfdString = nfd.normalize(str, errorCode); 661 if(U_FAILURE(errorCode)) { 662 parserErrorReason = "normalizing the relation string"; 663 return; 664 } 665 666 // The runtime code decomposes Hangul syllables on the fly, 667 // with recursive processing but without making the Jamo pieces visible for matching. 668 // It does not work with certain types of contextual mappings. 669 int32_t nfdLength = nfdString.length(); 670 if(nfdLength >= 2) { 671 UChar c = nfdString.charAt(0); 672 if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { 673 // While handling a Hangul syllable, contractions starting with Jamo L or V 674 // would not see the following Jamo of that syllable. 675 errorCode = U_UNSUPPORTED_ERROR; 676 parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; 677 return; 678 } 679 c = nfdString.charAt(nfdLength - 1); 680 if(Hangul::isJamoL(c) || 681 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { 682 // A contraction ending with Jamo L or L+V would require 683 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 684 // or decomposing a following Hangul syllable on the fly, during contraction matching. 685 errorCode = U_UNSUPPORTED_ERROR; 686 parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; 687 return; 688 } 689 // A Hangul syllable completely inside a contraction is ok. 690 } 691 // Note: If there is a prefix, then the parser checked that 692 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 693 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 694 // (While handling a Hangul syllable, prefixes on Jamo V or T 695 // would not see the previous Jamo of that syllable.) 696 697 if(strength != UCOL_IDENTICAL) { 698 // Find the node index after which we insert the new tailored node. 699 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 700 U_ASSERT(cesLength > 0); 701 int64_t ce = ces[cesLength - 1]; 702 if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { 703 // There is no primary gap between ignorables and the space-first-primary. 704 errorCode = U_UNSUPPORTED_ERROR; 705 parserErrorReason = "tailoring primary after ignorables not supported"; 706 return; 707 } 708 if(strength == UCOL_QUATERNARY && ce == 0) { 709 // The CE data structure does not support non-zero quaternary weights 710 // on tertiary ignorables. 711 errorCode = U_UNSUPPORTED_ERROR; 712 parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; 713 return; 714 } 715 // Insert the new tailored node. 716 index = insertTailoredNodeAfter(index, strength, errorCode); 717 if(U_FAILURE(errorCode)) { 718 parserErrorReason = "modifying collation elements"; 719 return; 720 } 721 // Strength of the temporary CE: 722 // The new relation may yield a stronger CE but not a weaker one. 723 int32_t tempStrength = ceStrength(ce); 724 if(strength < tempStrength) { tempStrength = strength; } 725 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 726 } 727 728 setCaseBits(nfdString, parserErrorReason, errorCode); 729 if(U_FAILURE(errorCode)) { return; } 730 731 int32_t cesLengthBeforeExtension = cesLength; 732 if(!extension.isEmpty()) { 733 UnicodeString nfdExtension = nfd.normalize(extension, errorCode); 734 if(U_FAILURE(errorCode)) { 735 parserErrorReason = "normalizing the relation extension"; 736 return; 737 } 738 cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); 739 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 740 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 741 parserErrorReason = 742 "extension string adds too many collation elements (more than 31 total)"; 743 return; 744 } 745 } 746 uint32_t ce32 = Collation::UNASSIGNED_CE32; 747 if((prefix != nfdPrefix || str != nfdString) && 748 !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { 749 // Map from the original input to the CEs. 750 // We do this in case the canonical closure is incomplete, 751 // so that it is possible to explicitly provide the missing mappings. 752 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); 753 } 754 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); 755 if(U_FAILURE(errorCode)) { 756 parserErrorReason = "writing collation elements"; 757 return; 758 } 759 cesLength = cesLengthBeforeExtension; 760 } 761 762 int32_t 763 CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, 764 UErrorCode &errorCode) { 765 if(U_FAILURE(errorCode)) { return 0; } 766 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); 767 768 // Find the last CE that is at least as "strong" as the requested difference. 769 // Note: Stronger is smaller (UCOL_PRIMARY=0). 770 int64_t ce; 771 for(;; --cesLength) { 772 if(cesLength == 0) { 773 ce = ces[0] = 0; 774 cesLength = 1; 775 break; 776 } else { 777 ce = ces[cesLength - 1]; 778 } 779 if(ceStrength(ce) <= strength) { break; } 780 } 781 782 if(isTempCE(ce)) { 783 // No need to findCommonNode() here for lower levels 784 // because insertTailoredNodeAfter() will do that anyway. 785 return indexFromTempCE(ce); 786 } 787 788 // root CE 789 if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { 790 errorCode = U_UNSUPPORTED_ERROR; 791 parserErrorReason = "tailoring relative to an unassigned code point not supported"; 792 return 0; 793 } 794 return findOrInsertNodeForRootCE(ce, strength, errorCode); 795 } 796 797 int32_t 798 CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { 799 if(U_FAILURE(errorCode)) { return 0; } 800 U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); 801 802 // Find or insert the node for each of the root CE's weights, 803 // down to the requested level/strength. 804 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 805 U_ASSERT((ce & 0xc0) == 0); 806 int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32), errorCode); 807 if(strength >= UCOL_SECONDARY) { 808 uint32_t lower32 = (uint32_t)ce; 809 index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); 810 if(strength >= UCOL_TERTIARY) { 811 index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, 812 UCOL_TERTIARY, errorCode); 813 } 814 } 815 return index; 816 } 817 818 namespace { 819 820 /** 821 * Like Java Collections.binarySearch(List, key, Comparator). 822 * 823 * @return the index>=0 where the item was found, 824 * or the index<0 for inserting the string at ~index in sorted order 825 * (index into rootPrimaryIndexes) 826 */ 827 int32_t 828 binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, 829 const int64_t *nodes, uint32_t p) { 830 if(length == 0) { return ~0; } 831 int32_t start = 0; 832 int32_t limit = length; 833 for (;;) { 834 int32_t i = (start + limit) / 2; 835 int64_t node = nodes[rootPrimaryIndexes[i]]; 836 uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) 837 if (p == nodePrimary) { 838 return i; 839 } else if (p < nodePrimary) { 840 if (i == start) { 841 return ~start; // insert s before i 842 } 843 limit = i; 844 } else { 845 if (i == start) { 846 return ~(start + 1); // insert s after i 847 } 848 start = i; 849 } 850 } 851 } 852 853 } // namespace 854 855 int32_t 856 CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { 857 if(U_FAILURE(errorCode)) { return 0; } 858 859 int32_t rootIndex = binarySearchForRootPrimaryNode( 860 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 861 if(rootIndex >= 0) { 862 return rootPrimaryIndexes.elementAti(rootIndex); 863 } else { 864 // Start a new list of nodes with this primary. 865 int32_t index = nodes.size(); 866 nodes.addElement(nodeFromWeight32(p), errorCode); 867 rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); 868 return index; 869 } 870 } 871 872 int32_t 873 CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { 874 if(U_FAILURE(errorCode)) { return 0; } 875 U_ASSERT(0 <= index && index < nodes.size()); 876 U_ASSERT(UCOL_SECONDARY <= level && level <= UCOL_TERTIARY); 877 878 if(weight16 == Collation::COMMON_WEIGHT16) { 879 return findCommonNode(index, level); 880 } 881 882 // If this will be the first below-common weight for the parent node, 883 // then we will also need to insert a common weight after it. 884 int64_t node = nodes.elementAti(index); 885 U_ASSERT(strengthFromNode(node) < level); // parent node is stronger 886 if(weight16 != 0 && weight16 < Collation::COMMON_WEIGHT16) { 887 int32_t hasThisLevelBefore = level == UCOL_SECONDARY ? HAS_BEFORE2 : HAS_BEFORE3; 888 if((node & hasThisLevelBefore) == 0) { 889 // The parent node has an implied level-common weight. 890 int64_t commonNode = 891 nodeFromWeight16(Collation::COMMON_WEIGHT16) | nodeFromStrength(level); 892 if(level == UCOL_SECONDARY) { 893 // Move the HAS_BEFORE3 flag from the parent node 894 // to the new secondary common node. 895 commonNode |= node & HAS_BEFORE3; 896 node &= ~(int64_t)HAS_BEFORE3; 897 } 898 nodes.setElementAt(node | hasThisLevelBefore, index); 899 // Insert below-common-weight node. 900 int32_t nextIndex = nextIndexFromNode(node); 901 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 902 index = insertNodeBetween(index, nextIndex, node, errorCode); 903 // Insert common-weight node. 904 insertNodeBetween(index, nextIndex, commonNode, errorCode); 905 // Return index of below-common-weight node. 906 return index; 907 } 908 } 909 910 // Find the root CE's weight for this level. 911 // Postpone insertion if not found: 912 // Insert the new root node before the next stronger node, 913 // or before the next root node with the same strength and a larger weight. 914 int32_t nextIndex; 915 while((nextIndex = nextIndexFromNode(node)) != 0) { 916 node = nodes.elementAti(nextIndex); 917 int32_t nextStrength = strengthFromNode(node); 918 if(nextStrength <= level) { 919 // Insert before a stronger node. 920 if(nextStrength < level) { break; } 921 // nextStrength == level 922 if(!isTailoredNode(node)) { 923 uint32_t nextWeight16 = weight16FromNode(node); 924 if(nextWeight16 == weight16) { 925 // Found the node for the root CE up to this level. 926 return nextIndex; 927 } 928 // Insert before a node with a larger same-strength weight. 929 if(nextWeight16 > weight16) { break; } 930 } 931 } 932 // Skip the next node. 933 index = nextIndex; 934 } 935 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 936 return insertNodeBetween(index, nextIndex, node, errorCode); 937 } 938 939 int32_t 940 CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { 941 if(U_FAILURE(errorCode)) { return 0; } 942 U_ASSERT(0 <= index && index < nodes.size()); 943 if(strength >= UCOL_SECONDARY) { 944 index = findCommonNode(index, UCOL_SECONDARY); 945 if(strength >= UCOL_TERTIARY) { 946 index = findCommonNode(index, UCOL_TERTIARY); 947 } 948 } 949 // Postpone insertion: 950 // Insert the new node before the next one with a strength at least as strong. 951 int64_t node = nodes.elementAti(index); 952 int32_t nextIndex; 953 while((nextIndex = nextIndexFromNode(node)) != 0) { 954 node = nodes.elementAti(nextIndex); 955 if(strengthFromNode(node) <= strength) { break; } 956 // Skip the next node which has a weaker (larger) strength than the new one. 957 index = nextIndex; 958 } 959 node = IS_TAILORED | nodeFromStrength(strength); 960 return insertNodeBetween(index, nextIndex, node, errorCode); 961 } 962 963 int32_t 964 CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, 965 UErrorCode &errorCode) { 966 if(U_FAILURE(errorCode)) { return 0; } 967 U_ASSERT(previousIndexFromNode(node) == 0); 968 U_ASSERT(nextIndexFromNode(node) == 0); 969 U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 970 // Append the new node and link it to the existing nodes. 971 int32_t newIndex = nodes.size(); 972 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 973 nodes.addElement(node, errorCode); 974 if(U_FAILURE(errorCode)) { return 0; } 975 // nodes[index].nextIndex = newIndex 976 node = nodes.elementAti(index); 977 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 978 // nodes[nextIndex].previousIndex = newIndex 979 if(nextIndex != 0) { 980 node = nodes.elementAti(nextIndex); 981 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 982 } 983 return newIndex; 984 } 985 986 int32_t 987 CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { 988 U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); 989 int64_t node = nodes.elementAti(index); 990 if(strengthFromNode(node) >= strength) { 991 // The current node is no stronger. 992 return index; 993 } 994 if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 995 // The current node implies the strength-common weight. 996 return index; 997 } 998 index = nextIndexFromNode(node); 999 node = nodes.elementAti(index); 1000 U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && 1001 weight16FromNode(node) < Collation::COMMON_WEIGHT16); 1002 // Skip to the explicit common node. 1003 do { 1004 index = nextIndexFromNode(node); 1005 node = nodes.elementAti(index); 1006 U_ASSERT(strengthFromNode(node) >= strength); 1007 } while(isTailoredNode(node) || strengthFromNode(node) > strength || 1008 weight16FromNode(node) < Collation::COMMON_WEIGHT16); 1009 U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); 1010 return index; 1011 } 1012 1013 void 1014 CollationBuilder::setCaseBits(const UnicodeString &nfdString, 1015 const char *&parserErrorReason, UErrorCode &errorCode) { 1016 if(U_FAILURE(errorCode)) { return; } 1017 int32_t numTailoredPrimaries = 0; 1018 for(int32_t i = 0; i < cesLength; ++i) { 1019 if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } 1020 } 1021 // We should not be able to get too many case bits because 1022 // cesLength<=31==MAX_EXPANSION_LENGTH. 1023 // 31 pairs of case bits fit into an int64_t without setting its sign bit. 1024 U_ASSERT(numTailoredPrimaries <= 31); 1025 1026 int64_t cases = 0; 1027 if(numTailoredPrimaries > 0) { 1028 const UChar *s = nfdString.getBuffer(); 1029 UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); 1030 int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; 1031 if(U_FAILURE(errorCode)) { 1032 parserErrorReason = "fetching root CEs for tailored string"; 1033 return; 1034 } 1035 U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); 1036 1037 uint32_t lastCase = 0; 1038 int32_t numBasePrimaries = 0; 1039 for(int32_t i = 0; i < baseCEsLength; ++i) { 1040 int64_t ce = baseCEs.getCE(i); 1041 if((ce >> 32) != 0) { 1042 ++numBasePrimaries; 1043 uint32_t c = ((uint32_t)ce >> 14) & 3; 1044 U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 1045 if(numBasePrimaries < numTailoredPrimaries) { 1046 cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); 1047 } else if(numBasePrimaries == numTailoredPrimaries) { 1048 lastCase = c; 1049 } else if(c != lastCase) { 1050 // There are more base primary CEs than tailored primaries. 1051 // Set mixed case if the case bits of the remainder differ. 1052 lastCase = 1; 1053 // Nothing more can change. 1054 break; 1055 } 1056 } 1057 } 1058 if(numBasePrimaries >= numTailoredPrimaries) { 1059 cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); 1060 } 1061 } 1062 1063 for(int32_t i = 0; i < cesLength; ++i) { 1064 int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits 1065 int32_t strength = ceStrength(ce); 1066 if(strength == UCOL_PRIMARY) { 1067 ce |= (cases & 3) << 14; 1068 cases >>= 2; 1069 } else if(strength == UCOL_TERTIARY) { 1070 // Tertiary CEs must have uppercase bits. 1071 // See the LDML spec, and comments in class CollationCompare. 1072 ce |= 0x8000; 1073 } 1074 // Tertiary ignorable CEs must have 0 case bits. 1075 // We set 0 case bits for secondary CEs too 1076 // since currently only U+0345 is cased and maps to a secondary CE, 1077 // and it is lowercase. Other secondaries are uncased. 1078 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 1079 ces[i] = ce; 1080 } 1081 } 1082 1083 void 1084 CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, 1085 UErrorCode &errorCode) { 1086 if(U_FAILURE(errorCode)) { return; } 1087 dataBuilder->suppressContractions(set, errorCode); 1088 if(U_FAILURE(errorCode)) { 1089 parserErrorReason = "application of [suppressContractions [set]] failed"; 1090 } 1091 } 1092 1093 void 1094 CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, 1095 UErrorCode &errorCode) { 1096 if(U_FAILURE(errorCode)) { return; } 1097 optimizeSet.addAll(set); 1098 } 1099 1100 uint32_t 1101 CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1102 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1103 UErrorCode &errorCode) { 1104 // Map from the NFD input to the CEs. 1105 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1106 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1107 addTailComposites(nfdPrefix, nfdString, errorCode); 1108 return ce32; 1109 } 1110 1111 uint32_t 1112 CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1113 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1114 UErrorCode &errorCode) { 1115 if(U_FAILURE(errorCode)) { return ce32; } 1116 1117 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 1118 if(nfdPrefix.isEmpty()) { 1119 CanonicalIterator stringIter(nfdString, errorCode); 1120 if(U_FAILURE(errorCode)) { return ce32; } 1121 UnicodeString prefix; 1122 for(;;) { 1123 UnicodeString str = stringIter.next(); 1124 if(str.isBogus()) { break; } 1125 if(ignoreString(str, errorCode) || str == nfdString) { continue; } 1126 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1127 if(U_FAILURE(errorCode)) { return ce32; } 1128 } 1129 } else { 1130 CanonicalIterator prefixIter(nfdPrefix, errorCode); 1131 CanonicalIterator stringIter(nfdString, errorCode); 1132 if(U_FAILURE(errorCode)) { return ce32; } 1133 for(;;) { 1134 UnicodeString prefix = prefixIter.next(); 1135 if(prefix.isBogus()) { break; } 1136 if(ignorePrefix(prefix, errorCode)) { continue; } 1137 UBool samePrefix = prefix == nfdPrefix; 1138 for(;;) { 1139 UnicodeString str = stringIter.next(); 1140 if(str.isBogus()) { break; } 1141 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } 1142 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1143 if(U_FAILURE(errorCode)) { return ce32; } 1144 } 1145 stringIter.reset(); 1146 } 1147 } 1148 return ce32; 1149 } 1150 1151 void 1152 CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1153 UErrorCode &errorCode) { 1154 if(U_FAILURE(errorCode)) { return; } 1155 1156 // Look for the last starter in the NFD string. 1157 UChar32 lastStarter; 1158 int32_t indexAfterLastStarter = nfdString.length(); 1159 for(;;) { 1160 if(indexAfterLastStarter == 0) { return; } // no starter at all 1161 lastStarter = nfdString.char32At(indexAfterLastStarter - 1); 1162 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 1163 indexAfterLastStarter -= U16_LENGTH(lastStarter); 1164 } 1165 // No closure to Hangul syllables since we decompose them on the fly. 1166 if(Hangul::isJamoL(lastStarter)) { return; } 1167 1168 // Are there any composites whose decomposition starts with the lastStarter? 1169 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 1170 // We might find some more equivalent mappings here if it did. 1171 UnicodeSet composites; 1172 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 1173 1174 UnicodeString decomp; 1175 UnicodeString newNFDString, newString; 1176 int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; 1177 UnicodeSetIterator iter(composites); 1178 while(iter.next()) { 1179 U_ASSERT(!iter.isString()); 1180 UChar32 composite = iter.getCodepoint(); 1181 nfd.getDecomposition(composite, decomp); 1182 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 1183 newNFDString, newString, errorCode)) { 1184 continue; 1185 } 1186 int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); 1187 if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { 1188 // Ignore mappings that we cannot store. 1189 continue; 1190 } 1191 // Note: It is possible that the newCEs do not make use of the mapping 1192 // for which we are adding the tail composites, in which case we might be adding 1193 // unnecessary mappings. 1194 // For example, when we add tail composites for ae^ (^=combining circumflex), 1195 // UCA discontiguous-contraction matching does not find any matches 1196 // for ae_^ (_=any combining diacritic below) *unless* there is also 1197 // a contraction mapping for ae. 1198 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 1199 // while fetching the newCEs for ae_^. 1200 // TODO: Try to detect this effectively. 1201 // (Alternatively, print a warning when prefix contractions are missing.) 1202 1203 // We do not need an explicit mapping for the NFD strings. 1204 // It is fine if the NFD input collates like this via a sequence of mappings. 1205 // It also saves a little bit of space, and may reduce the set of characters with contractions. 1206 uint32_t ce32 = addIfDifferent(nfdPrefix, newString, 1207 newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); 1208 if(ce32 != Collation::UNASSIGNED_CE32) { 1209 // was different, was added 1210 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode); 1211 } 1212 } 1213 } 1214 1215 UBool 1216 CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, 1217 int32_t indexAfterLastStarter, 1218 UChar32 composite, const UnicodeString &decomp, 1219 UnicodeString &newNFDString, UnicodeString &newString, 1220 UErrorCode &errorCode) const { 1221 if(U_FAILURE(errorCode)) { return FALSE; } 1222 U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); 1223 int32_t lastStarterLength = decomp.moveIndex32(0, 1); 1224 if(lastStarterLength == decomp.length()) { 1225 // Singleton decompositions should be found by addWithClosure() 1226 // and the CanonicalIterator, so we can ignore them here. 1227 return FALSE; 1228 } 1229 if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, 1230 decomp, lastStarterLength, 0x7fffffff) == 0) { 1231 // same strings, nothing new to be found here 1232 return FALSE; 1233 } 1234 1235 // Make new FCD strings that combine a composite, or its decomposition, 1236 // into the nfdString's last starter and the combining marks following it. 1237 // Make an NFD version, and a version with the composite. 1238 newNFDString.setTo(nfdString, 0, indexAfterLastStarter); 1239 newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); 1240 1241 // The following is related to discontiguous contraction matching, 1242 // but builds only FCD strings (or else returns FALSE). 1243 int32_t sourceIndex = indexAfterLastStarter; 1244 int32_t decompIndex = lastStarterLength; 1245 // Small optimization: We keep the source character across loop iterations 1246 // because we do not always consume it, 1247 // and then need not fetch it again nor look up its combining class again. 1248 UChar32 sourceChar = U_SENTINEL; 1249 // The cc variables need to be declared before the loop so that at the end 1250 // they are set to the last combining classes seen. 1251 uint8_t sourceCC = 0; 1252 uint8_t decompCC = 0; 1253 for(;;) { 1254 if(sourceChar < 0) { 1255 if(sourceIndex >= nfdString.length()) { break; } 1256 sourceChar = nfdString.char32At(sourceIndex); 1257 sourceCC = nfd.getCombiningClass(sourceChar); 1258 U_ASSERT(sourceCC != 0); 1259 } 1260 // We consume a decomposition character in each iteration. 1261 if(decompIndex >= decomp.length()) { break; } 1262 UChar32 decompChar = decomp.char32At(decompIndex); 1263 decompCC = nfd.getCombiningClass(decompChar); 1264 // Compare the two characters and their combining classes. 1265 if(decompCC == 0) { 1266 // Unable to merge because the source contains a non-zero combining mark 1267 // but the composite's decomposition contains another starter. 1268 // The strings would not be equivalent. 1269 return FALSE; 1270 } else if(sourceCC < decompCC) { 1271 // Composite + sourceChar would not be FCD. 1272 return FALSE; 1273 } else if(decompCC < sourceCC) { 1274 newNFDString.append(decompChar); 1275 decompIndex += U16_LENGTH(decompChar); 1276 } else if(decompChar != sourceChar) { 1277 // Blocked because same combining class. 1278 return FALSE; 1279 } else { // match: decompChar == sourceChar 1280 newNFDString.append(decompChar); 1281 decompIndex += U16_LENGTH(decompChar); 1282 sourceIndex += U16_LENGTH(decompChar); 1283 sourceChar = U_SENTINEL; 1284 } 1285 } 1286 // We are at the end of at least one of the two inputs. 1287 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1288 if(sourceCC < decompCC) { 1289 // Appending the next source character to the composite would not be FCD. 1290 return FALSE; 1291 } 1292 newNFDString.append(nfdString, sourceIndex, 0x7fffffff); 1293 newString.append(nfdString, sourceIndex, 0x7fffffff); 1294 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1295 newNFDString.append(decomp, decompIndex, 0x7fffffff); 1296 } 1297 U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); 1298 U_ASSERT(fcd.isNormalized(newString, errorCode)); 1299 U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent 1300 return TRUE; 1301 } 1302 1303 UBool 1304 CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { 1305 // Do not map non-FCD prefixes. 1306 return !isFCD(s, errorCode); 1307 } 1308 1309 UBool 1310 CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { 1311 // Do not map non-FCD strings. 1312 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1313 return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); 1314 } 1315 1316 UBool 1317 CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { 1318 return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); 1319 } 1320 1321 void 1322 CollationBuilder::closeOverComposites(UErrorCode &errorCode) { 1323 UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final 1324 if(U_FAILURE(errorCode)) { return; } 1325 // Hangul is decomposed on the fly during collation. 1326 composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 1327 UnicodeString prefix; // empty 1328 UnicodeString nfdString; 1329 UnicodeSetIterator iter(composites); 1330 while(iter.next()) { 1331 U_ASSERT(!iter.isString()); 1332 nfd.getDecomposition(iter.getCodepoint(), nfdString); 1333 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 1334 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 1335 // Too many CEs from the decomposition (unusual), ignore this composite. 1336 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1337 // However, this can only really happen in contrived cases. 1338 continue; 1339 } 1340 const UnicodeString &composite(iter.getString()); 1341 addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode); 1342 } 1343 } 1344 1345 uint32_t 1346 CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, 1347 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1348 UErrorCode &errorCode) { 1349 if(U_FAILURE(errorCode)) { return ce32; } 1350 int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; 1351 int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); 1352 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1353 if(ce32 == Collation::UNASSIGNED_CE32) { 1354 ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); 1355 } 1356 dataBuilder->addCE32(prefix, str, ce32, errorCode); 1357 } 1358 return ce32; 1359 } 1360 1361 UBool 1362 CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, 1363 const int64_t ces2[], int32_t ces2Length) { 1364 if(ces1Length != ces2Length) { 1365 return FALSE; 1366 } 1367 U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); 1368 for(int32_t i = 0; i < ces1Length; ++i) { 1369 if(ces1[i] != ces2[i]) { return FALSE; } 1370 } 1371 return TRUE; 1372 } 1373 1374 #ifdef DEBUG_COLLATION_BUILDER 1375 1376 uint32_t 1377 alignWeightRight(uint32_t w) { 1378 if(w != 0) { 1379 while((w & 0xff) == 0) { w >>= 8; } 1380 } 1381 return w; 1382 } 1383 1384 #endif 1385 1386 void 1387 CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { 1388 if(U_FAILURE(errorCode)) { return; } 1389 1390 CollationWeights primaries, secondaries, tertiaries; 1391 int64_t *nodesArray = nodes.getBuffer(); 1392 #ifdef DEBUG_COLLATION_BUILDER 1393 puts("\nCollationBuilder::makeTailoredCEs()"); 1394 #endif 1395 1396 for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1397 int32_t i = rootPrimaryIndexes.elementAti(rpi); 1398 int64_t node = nodesArray[i]; 1399 uint32_t p = weight32FromNode(node); 1400 uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; 1401 uint32_t t = s; 1402 uint32_t q = 0; 1403 UBool pIsTailored = FALSE; 1404 UBool sIsTailored = FALSE; 1405 UBool tIsTailored = FALSE; 1406 #ifdef DEBUG_COLLATION_BUILDER 1407 printf("\nprimary %lx\n", (long)alignWeightRight(p)); 1408 #endif 1409 int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1410 int32_t nextIndex = nextIndexFromNode(node); 1411 while(nextIndex != 0) { 1412 i = nextIndex; 1413 node = nodesArray[i]; 1414 nextIndex = nextIndexFromNode(node); 1415 int32_t strength = strengthFromNode(node); 1416 if(strength == UCOL_QUATERNARY) { 1417 U_ASSERT(isTailoredNode(node)); 1418 #ifdef DEBUG_COLLATION_BUILDER 1419 printf(" quat+ "); 1420 #endif 1421 if(q == 3) { 1422 errorCode = U_BUFFER_OVERFLOW_ERROR; 1423 errorReason = "quaternary tailoring gap too small"; 1424 return; 1425 } 1426 ++q; 1427 } else { 1428 if(strength == UCOL_TERTIARY) { 1429 if(isTailoredNode(node)) { 1430 #ifdef DEBUG_COLLATION_BUILDER 1431 printf(" ter+ "); 1432 #endif 1433 if(!tIsTailored) { 1434 // First tailored tertiary node for [p, s]. 1435 int32_t tCount = countTailoredNodes(nodesArray, nextIndex, 1436 UCOL_TERTIARY) + 1; 1437 uint32_t tLimit; 1438 if(t == 0) { 1439 // Gap at the beginning of the tertiary CE range. 1440 t = rootElements.getTertiaryBoundary() - 0x100; 1441 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; 1442 } else if(!pIsTailored && !sIsTailored) { 1443 // p and s are root weights. 1444 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1445 } else if(t == Collation::BEFORE_WEIGHT16) { 1446 tLimit = Collation::COMMON_WEIGHT16; 1447 } else { 1448 // [p, s] is tailored. 1449 U_ASSERT(t == Collation::COMMON_WEIGHT16); 1450 tLimit = rootElements.getTertiaryBoundary(); 1451 } 1452 U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); 1453 tertiaries.initForTertiary(); 1454 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1455 errorCode = U_BUFFER_OVERFLOW_ERROR; 1456 errorReason = "tertiary tailoring gap too small"; 1457 return; 1458 } 1459 tIsTailored = TRUE; 1460 } 1461 t = tertiaries.nextWeight(); 1462 U_ASSERT(t != 0xffffffff); 1463 } else { 1464 t = weight16FromNode(node); 1465 tIsTailored = FALSE; 1466 #ifdef DEBUG_COLLATION_BUILDER 1467 printf(" ter %lx\n", (long)alignWeightRight(t)); 1468 #endif 1469 } 1470 } else { 1471 if(strength == UCOL_SECONDARY) { 1472 if(isTailoredNode(node)) { 1473 #ifdef DEBUG_COLLATION_BUILDER 1474 printf(" sec+ "); 1475 #endif 1476 if(!sIsTailored) { 1477 // First tailored secondary node for p. 1478 int32_t sCount = countTailoredNodes(nodesArray, nextIndex, 1479 UCOL_SECONDARY) + 1; 1480 uint32_t sLimit; 1481 if(s == 0) { 1482 // Gap at the beginning of the secondary CE range. 1483 s = rootElements.getSecondaryBoundary() - 0x100; 1484 sLimit = rootElements.getFirstSecondaryCE() >> 16; 1485 } else if(!pIsTailored) { 1486 // p is a root primary. 1487 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1488 } else if(s == Collation::BEFORE_WEIGHT16) { 1489 sLimit = Collation::COMMON_WEIGHT16; 1490 } else { 1491 // p is a tailored primary. 1492 U_ASSERT(s == Collation::COMMON_WEIGHT16); 1493 sLimit = rootElements.getSecondaryBoundary(); 1494 } 1495 if(s == Collation::COMMON_WEIGHT16) { 1496 // Do not tailor into the getSortKey() range of 1497 // compressed common secondaries. 1498 s = rootElements.getLastCommonSecondary(); 1499 } 1500 secondaries.initForSecondary(); 1501 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1502 errorCode = U_BUFFER_OVERFLOW_ERROR; 1503 errorReason = "secondary tailoring gap too small"; 1504 #ifdef DEBUG_COLLATION_BUILDER 1505 printf("!secondaries.allocWeights(%lx, %lx, sCount=%ld)\n", 1506 (long)alignWeightRight(s), (long)alignWeightRight(sLimit), 1507 (long)alignWeightRight(sCount)); 1508 #endif 1509 return; 1510 } 1511 sIsTailored = TRUE; 1512 } 1513 s = secondaries.nextWeight(); 1514 U_ASSERT(s != 0xffffffff); 1515 } else { 1516 s = weight16FromNode(node); 1517 sIsTailored = FALSE; 1518 #ifdef DEBUG_COLLATION_BUILDER 1519 printf(" sec %lx\n", (long)alignWeightRight(s)); 1520 #endif 1521 } 1522 } else /* UCOL_PRIMARY */ { 1523 U_ASSERT(isTailoredNode(node)); 1524 #ifdef DEBUG_COLLATION_BUILDER 1525 printf("pri+ "); 1526 #endif 1527 if(!pIsTailored) { 1528 // First tailored primary node in this list. 1529 int32_t pCount = countTailoredNodes(nodesArray, nextIndex, 1530 UCOL_PRIMARY) + 1; 1531 UBool isCompressible = baseData->isCompressiblePrimary(p); 1532 uint32_t pLimit = 1533 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1534 primaries.initForPrimary(isCompressible); 1535 if(!primaries.allocWeights(p, pLimit, pCount)) { 1536 errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? 1537 errorReason = "primary tailoring gap too small"; 1538 return; 1539 } 1540 pIsTailored = TRUE; 1541 } 1542 p = primaries.nextWeight(); 1543 U_ASSERT(p != 0xffffffff); 1544 s = Collation::COMMON_WEIGHT16; 1545 sIsTailored = FALSE; 1546 } 1547 t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; 1548 tIsTailored = FALSE; 1549 } 1550 q = 0; 1551 } 1552 if(isTailoredNode(node)) { 1553 nodesArray[i] = Collation::makeCE(p, s, t, q); 1554 #ifdef DEBUG_COLLATION_BUILDER 1555 printf("%016llx\n", (long long)nodesArray[i]); 1556 #endif 1557 } 1558 } 1559 } 1560 } 1561 1562 int32_t 1563 CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { 1564 int32_t count = 0; 1565 for(;;) { 1566 if(i == 0) { break; } 1567 int64_t node = nodesArray[i]; 1568 if(strengthFromNode(node) < strength) { break; } 1569 if(strengthFromNode(node) == strength) { 1570 if(isTailoredNode(node)) { 1571 ++count; 1572 } else { 1573 break; 1574 } 1575 } 1576 i = nextIndexFromNode(node); 1577 } 1578 return count; 1579 } 1580 1581 class CEFinalizer : public CollationDataBuilder::CEModifier { 1582 public: 1583 CEFinalizer(const int64_t *ces) : finalCEs(ces) {} 1584 virtual ~CEFinalizer(); 1585 virtual int64_t modifyCE32(uint32_t ce32) const { 1586 U_ASSERT(!Collation::isSpecialCE32(ce32)); 1587 if(CollationBuilder::isTempCE32(ce32)) { 1588 // retain case bits 1589 return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1590 } else { 1591 return Collation::NO_CE; 1592 } 1593 } 1594 virtual int64_t modifyCE(int64_t ce) const { 1595 if(CollationBuilder::isTempCE(ce)) { 1596 // retain case bits 1597 return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000); 1598 } else { 1599 return Collation::NO_CE; 1600 } 1601 } 1602 1603 private: 1604 const int64_t *finalCEs; 1605 }; 1606 1607 CEFinalizer::~CEFinalizer() {} 1608 1609 void 1610 CollationBuilder::finalizeCEs(UErrorCode &errorCode) { 1611 if(U_FAILURE(errorCode)) { return; } 1612 LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode), errorCode); 1613 if(U_FAILURE(errorCode)) { 1614 return; 1615 } 1616 newBuilder->initForTailoring(baseData, errorCode); 1617 CEFinalizer finalizer(nodes.getBuffer()); 1618 newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); 1619 if(U_FAILURE(errorCode)) { return; } 1620 delete dataBuilder; 1621 dataBuilder = newBuilder.orphan(); 1622 } 1623 1624 int32_t 1625 CollationBuilder::ceStrength(int64_t ce) { 1626 return 1627 isTempCE(ce) ? strengthFromTempCE(ce) : 1628 (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : 1629 ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : 1630 ce != 0 ? UCOL_TERTIARY : 1631 UCOL_IDENTICAL; 1632 } 1633 1634 U_NAMESPACE_END 1635 1636 U_NAMESPACE_USE 1637 1638 U_CAPI UCollator * U_EXPORT2 1639 ucol_openRules(const UChar *rules, int32_t rulesLength, 1640 UColAttributeValue normalizationMode, UCollationStrength strength, 1641 UParseError *parseError, UErrorCode *pErrorCode) { 1642 if(U_FAILURE(*pErrorCode)) { return NULL; } 1643 if(rules == NULL && rulesLength != 0) { 1644 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1645 return NULL; 1646 } 1647 RuleBasedCollator *coll = new RuleBasedCollator(); 1648 if(coll == NULL) { 1649 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 1650 return NULL; 1651 } 1652 UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); 1653 coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode); 1654 if(U_FAILURE(*pErrorCode)) { 1655 delete coll; 1656 return NULL; 1657 } 1658 return coll->toUCollator(); 1659 } 1660 1661 static const int32_t internalBufferSize = 512; 1662 1663 // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp 1664 // because it calls UnicodeSet "builder" code that depends on all Unicode properties, 1665 // and the rest of the collation "runtime" code only depends on normalization. 1666 // This function is not related to the collation builder, 1667 // but it did not seem worth moving it into its own .cpp file, 1668 // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. 1669 U_CAPI int32_t U_EXPORT2 1670 ucol_getUnsafeSet( const UCollator *coll, 1671 USet *unsafe, 1672 UErrorCode *status) 1673 { 1674 UChar buffer[internalBufferSize]; 1675 int32_t len = 0; 1676 1677 uset_clear(unsafe); 1678 1679 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1680 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1681 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1682 1683 // add chars that fail the fcd check 1684 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1685 1686 // add lead/trail surrogates 1687 // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, 1688 // not when testing code *points*) 1689 uset_addRange(unsafe, 0xd800, 0xdfff); 1690 1691 USet *contractions = uset_open(0,0); 1692 1693 int32_t i = 0, j = 0; 1694 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1695 int32_t contsSize = uset_size(contractions); 1696 UChar32 c = 0; 1697 // Contraction set consists only of strings 1698 // to get unsafe code points, we need to 1699 // break the strings apart and add them to the unsafe set 1700 for(i = 0; i < contsSize; i++) { 1701 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1702 if(len > 0) { 1703 j = 0; 1704 while(j < len) { 1705 U16_NEXT(buffer, j, len, c); 1706 if(j < len) { 1707 uset_add(unsafe, c); 1708 } 1709 } 1710 } 1711 } 1712 1713 uset_close(contractions); 1714 1715 return uset_size(unsafe); 1716 } 1717 1718 #endif // !UCONFIG_NO_COLLATION 1719