1 /* 2 ******************************************************************************* 3 * Copyright (C) 2013-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * collationbuilder.cpp 7 * 8 * (replaced the former ucol_bld.cpp) 9 * 10 * created on: 2013may06 11 * created by: Markus W. Scherer 12 */ 13 14 #ifdef DEBUG_COLLATION_BUILDER 15 #include <stdio.h> 16 #endif 17 18 #include "unicode/utypes.h" 19 20 #if !UCONFIG_NO_COLLATION 21 22 #include "unicode/caniter.h" 23 #include "unicode/normalizer2.h" 24 #include "unicode/tblcoll.h" 25 #include "unicode/parseerr.h" 26 #include "unicode/uchar.h" 27 #include "unicode/ucol.h" 28 #include "unicode/unistr.h" 29 #include "unicode/usetiter.h" 30 #include "unicode/utf16.h" 31 #include "unicode/uversion.h" 32 #include "cmemory.h" 33 #include "collation.h" 34 #include "collationbuilder.h" 35 #include "collationdata.h" 36 #include "collationdatabuilder.h" 37 #include "collationfastlatin.h" 38 #include "collationroot.h" 39 #include "collationrootelements.h" 40 #include "collationruleparser.h" 41 #include "collationsettings.h" 42 #include "collationtailoring.h" 43 #include "collationweights.h" 44 #include "normalizer2impl.h" 45 #include "uassert.h" 46 #include "ucol_imp.h" 47 #include "utf16collationiterator.h" 48 49 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 50 51 U_NAMESPACE_BEGIN 52 53 namespace { 54 55 class BundleImporter : public CollationRuleParser::Importer { 56 public: 57 BundleImporter() : rules(NULL) {} 58 virtual ~BundleImporter(); 59 virtual const UnicodeString *getRules( 60 const char *localeID, const char *collationType, 61 const char *&errorReason, UErrorCode &errorCode); 62 63 private: 64 UnicodeString *rules; 65 }; 66 67 BundleImporter::~BundleImporter() { 68 delete rules; 69 } 70 71 const UnicodeString * 72 BundleImporter::getRules( 73 const char *localeID, const char *collationType, 74 const char *& /*errorReason*/, UErrorCode &errorCode) { 75 delete rules; 76 return rules = CollationLoader::loadRules(localeID, collationType, errorCode); 77 } 78 79 } // namespace 80 81 // RuleBasedCollator implementation ---------------------------------------- *** 82 83 // These methods are here, rather than in rulebasedcollator.cpp, 84 // for modularization: 85 // Most code using Collator does not need to build a Collator from rules. 86 // By moving these constructors and helper methods to a separate file, 87 // most code will not have a static dependency on the builder code. 88 89 RuleBasedCollator::RuleBasedCollator() 90 : data(NULL), 91 settings(NULL), 92 tailoring(NULL), 93 validLocale(""), 94 explicitlySetAttributes(0), 95 actualLocaleIsSameAsValid(FALSE) { 96 } 97 98 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, UErrorCode &errorCode) 99 : data(NULL), 100 settings(NULL), 101 tailoring(NULL), 102 validLocale(""), 103 explicitlySetAttributes(0), 104 actualLocaleIsSameAsValid(FALSE) { 105 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, NULL, NULL, errorCode); 106 } 107 108 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, ECollationStrength strength, 109 UErrorCode &errorCode) 110 : data(NULL), 111 settings(NULL), 112 tailoring(NULL), 113 validLocale(""), 114 explicitlySetAttributes(0), 115 actualLocaleIsSameAsValid(FALSE) { 116 internalBuildTailoring(rules, strength, UCOL_DEFAULT, NULL, NULL, errorCode); 117 } 118 119 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 120 UColAttributeValue decompositionMode, 121 UErrorCode &errorCode) 122 : data(NULL), 123 settings(NULL), 124 tailoring(NULL), 125 validLocale(""), 126 explicitlySetAttributes(0), 127 actualLocaleIsSameAsValid(FALSE) { 128 internalBuildTailoring(rules, UCOL_DEFAULT, decompositionMode, NULL, NULL, errorCode); 129 } 130 131 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 132 ECollationStrength strength, 133 UColAttributeValue decompositionMode, 134 UErrorCode &errorCode) 135 : data(NULL), 136 settings(NULL), 137 tailoring(NULL), 138 validLocale(""), 139 explicitlySetAttributes(0), 140 actualLocaleIsSameAsValid(FALSE) { 141 internalBuildTailoring(rules, strength, decompositionMode, NULL, NULL, errorCode); 142 } 143 144 RuleBasedCollator::RuleBasedCollator(const UnicodeString &rules, 145 UParseError &parseError, UnicodeString &reason, 146 UErrorCode &errorCode) 147 : data(NULL), 148 settings(NULL), 149 tailoring(NULL), 150 validLocale(""), 151 explicitlySetAttributes(0), 152 actualLocaleIsSameAsValid(FALSE) { 153 internalBuildTailoring(rules, UCOL_DEFAULT, UCOL_DEFAULT, &parseError, &reason, errorCode); 154 } 155 156 void 157 RuleBasedCollator::internalBuildTailoring(const UnicodeString &rules, 158 int32_t strength, 159 UColAttributeValue decompositionMode, 160 UParseError *outParseError, UnicodeString *outReason, 161 UErrorCode &errorCode) { 162 const CollationTailoring *base = CollationRoot::getRoot(errorCode); 163 if(U_FAILURE(errorCode)) { return; } 164 if(outReason != NULL) { outReason->remove(); } 165 CollationBuilder builder(base, errorCode); 166 UVersionInfo noVersion = { 0, 0, 0, 0 }; 167 BundleImporter importer; 168 LocalPointer<CollationTailoring> t(builder.parseAndBuild(rules, noVersion, 169 &importer, 170 outParseError, errorCode)); 171 if(U_FAILURE(errorCode)) { 172 const char *reason = builder.getErrorReason(); 173 if(reason != NULL && outReason != NULL) { 174 *outReason = UnicodeString(reason, -1, US_INV); 175 } 176 return; 177 } 178 t->actualLocale.setToBogus(); 179 adoptTailoring(t.orphan()); 180 // Set attributes after building the collator, 181 // to keep the default settings consistent with the rule string. 182 if(strength != UCOL_DEFAULT) { 183 setAttribute(UCOL_STRENGTH, (UColAttributeValue)strength, errorCode); 184 } 185 if(decompositionMode != UCOL_DEFAULT) { 186 setAttribute(UCOL_NORMALIZATION_MODE, decompositionMode, errorCode); 187 } 188 } 189 190 // CollationBuilder implementation ----------------------------------------- *** 191 192 CollationBuilder::CollationBuilder(const CollationTailoring *b, UErrorCode &errorCode) 193 : nfd(*Normalizer2::getNFDInstance(errorCode)), 194 fcd(*Normalizer2Factory::getFCDInstance(errorCode)), 195 nfcImpl(*Normalizer2Factory::getNFCImpl(errorCode)), 196 base(b), 197 baseData(b->data), 198 rootElements(b->data->rootElements, b->data->rootElementsLength), 199 variableTop(0), 200 dataBuilder(new CollationDataBuilder(errorCode)), fastLatinEnabled(TRUE), 201 errorReason(NULL), 202 cesLength(0), 203 rootPrimaryIndexes(errorCode), nodes(errorCode) { 204 nfcImpl.ensureCanonIterData(errorCode); 205 if(U_FAILURE(errorCode)) { 206 errorReason = "CollationBuilder fields initialization failed"; 207 return; 208 } 209 if(dataBuilder == NULL) { 210 errorCode = U_MEMORY_ALLOCATION_ERROR; 211 return; 212 } 213 dataBuilder->initForTailoring(baseData, errorCode); 214 if(U_FAILURE(errorCode)) { 215 errorReason = "CollationBuilder initialization failed"; 216 } 217 } 218 219 CollationBuilder::~CollationBuilder() { 220 delete dataBuilder; 221 } 222 223 CollationTailoring * 224 CollationBuilder::parseAndBuild(const UnicodeString &ruleString, 225 const UVersionInfo rulesVersion, 226 CollationRuleParser::Importer *importer, 227 UParseError *outParseError, 228 UErrorCode &errorCode) { 229 if(U_FAILURE(errorCode)) { return NULL; } 230 if(baseData->rootElements == NULL) { 231 errorCode = U_MISSING_RESOURCE_ERROR; 232 errorReason = "missing root elements data, tailoring not supported"; 233 return NULL; 234 } 235 LocalPointer<CollationTailoring> tailoring(new CollationTailoring(base->settings)); 236 if(tailoring.isNull() || tailoring->isBogus()) { 237 errorCode = U_MEMORY_ALLOCATION_ERROR; 238 return NULL; 239 } 240 CollationRuleParser parser(baseData, errorCode); 241 if(U_FAILURE(errorCode)) { return NULL; } 242 // Note: This always bases &[last variable] and &[first regular] 243 // on the root collator's maxVariable/variableTop. 244 // If we wanted this to change after [maxVariable x], then we would keep 245 // the tailoring.settings pointer here and read its variableTop when we need it. 246 // See http://unicode.org/cldr/trac/ticket/6070 247 variableTop = base->settings->variableTop; 248 parser.setSink(this); 249 parser.setImporter(importer); 250 CollationSettings &ownedSettings = *SharedObject::copyOnWrite(tailoring->settings); 251 parser.parse(ruleString, ownedSettings, outParseError, errorCode); 252 errorReason = parser.getErrorReason(); 253 if(U_FAILURE(errorCode)) { return NULL; } 254 if(dataBuilder->hasMappings()) { 255 makeTailoredCEs(errorCode); 256 closeOverComposites(errorCode); 257 finalizeCEs(errorCode); 258 // Copy all of ASCII, and Latin-1 letters, into each tailoring. 259 optimizeSet.add(0, 0x7f); 260 optimizeSet.add(0xc0, 0xff); 261 // Hangul is decomposed on the fly during collation, 262 // and the tailoring data is always built with HANGUL_TAG specials. 263 optimizeSet.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 264 dataBuilder->optimize(optimizeSet, errorCode); 265 tailoring->ensureOwnedData(errorCode); 266 if(U_FAILURE(errorCode)) { return NULL; } 267 if(fastLatinEnabled) { dataBuilder->enableFastLatin(); } 268 dataBuilder->build(*tailoring->ownedData, errorCode); 269 tailoring->builder = dataBuilder; 270 dataBuilder = NULL; 271 } else { 272 tailoring->data = baseData; 273 } 274 if(U_FAILURE(errorCode)) { return NULL; } 275 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 276 tailoring->data, ownedSettings, 277 ownedSettings.fastLatinPrimaries, LENGTHOF(ownedSettings.fastLatinPrimaries)); 278 tailoring->rules = ruleString; 279 tailoring->rules.getTerminatedBuffer(); // ensure NUL-termination 280 tailoring->setVersion(base->version, rulesVersion); 281 return tailoring.orphan(); 282 } 283 284 void 285 CollationBuilder::addReset(int32_t strength, const UnicodeString &str, 286 const char *&parserErrorReason, UErrorCode &errorCode) { 287 if(U_FAILURE(errorCode)) { return; } 288 U_ASSERT(!str.isEmpty()); 289 if(str.charAt(0) == CollationRuleParser::POS_LEAD) { 290 ces[0] = getSpecialResetPosition(str, parserErrorReason, errorCode); 291 cesLength = 1; 292 if(U_FAILURE(errorCode)) { return; } 293 U_ASSERT((ces[0] & Collation::CASE_AND_QUATERNARY_MASK) == 0); 294 } else { 295 // normal reset to a character or string 296 UnicodeString nfdString = nfd.normalize(str, errorCode); 297 if(U_FAILURE(errorCode)) { 298 parserErrorReason = "normalizing the reset position"; 299 return; 300 } 301 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 302 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 303 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 304 parserErrorReason = "reset position maps to too many collation elements (more than 31)"; 305 return; 306 } 307 } 308 if(strength == UCOL_IDENTICAL) { return; } // simple reset-at-position 309 310 // &[before strength]position 311 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_TERTIARY); 312 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 313 if(U_FAILURE(errorCode)) { return; } 314 315 int64_t node = nodes.elementAti(index); 316 // If the index is for a "weaker" tailored node, 317 // then skip backwards over this and further "weaker" nodes. 318 while(strengthFromNode(node) > strength) { 319 index = previousIndexFromNode(node); 320 node = nodes.elementAti(index); 321 } 322 323 // Find or insert a node whose index we will put into a temporary CE. 324 if(strengthFromNode(node) == strength && isTailoredNode(node)) { 325 // Reset to just before this same-strength tailored node. 326 index = previousIndexFromNode(node); 327 } else if(strength == UCOL_PRIMARY) { 328 // root primary node (has no previous index) 329 uint32_t p = weight32FromNode(node); 330 if(p == 0) { 331 errorCode = U_UNSUPPORTED_ERROR; 332 parserErrorReason = "reset primary-before ignorable not possible"; 333 return; 334 } 335 if(p <= rootElements.getFirstPrimary()) { 336 // There is no primary gap between ignorables and the space-first-primary. 337 errorCode = U_UNSUPPORTED_ERROR; 338 parserErrorReason = "reset primary-before first non-ignorable not supported"; 339 return; 340 } 341 if(p == Collation::FIRST_TRAILING_PRIMARY) { 342 // We do not support tailoring to an unassigned-implicit CE. 343 errorCode = U_UNSUPPORTED_ERROR; 344 parserErrorReason = "reset primary-before [first trailing] not supported"; 345 return; 346 } 347 p = rootElements.getPrimaryBefore(p, baseData->isCompressiblePrimary(p)); 348 index = findOrInsertNodeForPrimary(p, errorCode); 349 // Go to the last node in this list: 350 // Tailor after the last node between adjacent root nodes. 351 for(;;) { 352 node = nodes.elementAti(index); 353 int32_t nextIndex = nextIndexFromNode(node); 354 if(nextIndex == 0) { break; } 355 index = nextIndex; 356 } 357 } else { 358 // &[before 2] or &[before 3] 359 index = findCommonNode(index, UCOL_SECONDARY); 360 if(strength >= UCOL_TERTIARY) { 361 index = findCommonNode(index, UCOL_TERTIARY); 362 } 363 node = nodes.elementAti(index); 364 if(strengthFromNode(node) == strength) { 365 // Found a same-strength node with an explicit weight. 366 uint32_t weight16 = weight16FromNode(node); 367 if(weight16 == 0) { 368 errorCode = U_UNSUPPORTED_ERROR; 369 if(strength == UCOL_SECONDARY) { 370 parserErrorReason = "reset secondary-before secondary ignorable not possible"; 371 } else { 372 parserErrorReason = "reset tertiary-before completely ignorable not possible"; 373 } 374 return; 375 } 376 U_ASSERT(weight16 >= Collation::COMMON_WEIGHT16); 377 int32_t previousIndex = previousIndexFromNode(node); 378 if(weight16 == Collation::COMMON_WEIGHT16) { 379 // Reset to just before this same-strength common-weight node. 380 index = previousIndex; 381 } else { 382 // A non-common weight is only possible from a root CE. 383 // Find the higher-level weights, which must all be explicit, 384 // and then find the preceding weight for this level. 385 uint32_t previousWeight16 = 0; 386 int32_t previousWeightIndex = -1; 387 int32_t i = index; 388 if(strength == UCOL_SECONDARY) { 389 uint32_t p; 390 do { 391 i = previousIndexFromNode(node); 392 node = nodes.elementAti(i); 393 if(strengthFromNode(node) == UCOL_SECONDARY && !isTailoredNode(node) && 394 previousWeightIndex < 0) { 395 previousWeightIndex = i; 396 previousWeight16 = weight16FromNode(node); 397 } 398 } while(strengthFromNode(node) > UCOL_PRIMARY); 399 U_ASSERT(!isTailoredNode(node)); 400 p = weight32FromNode(node); 401 weight16 = rootElements.getSecondaryBefore(p, weight16); 402 } else { 403 uint32_t p, s; 404 do { 405 i = previousIndexFromNode(node); 406 node = nodes.elementAti(i); 407 if(strengthFromNode(node) == UCOL_TERTIARY && !isTailoredNode(node) && 408 previousWeightIndex < 0) { 409 previousWeightIndex = i; 410 previousWeight16 = weight16FromNode(node); 411 } 412 } while(strengthFromNode(node) > UCOL_SECONDARY); 413 U_ASSERT(!isTailoredNode(node)); 414 if(strengthFromNode(node) == UCOL_SECONDARY) { 415 s = weight16FromNode(node); 416 do { 417 i = previousIndexFromNode(node); 418 node = nodes.elementAti(i); 419 } while(strengthFromNode(node) > UCOL_PRIMARY); 420 U_ASSERT(!isTailoredNode(node)); 421 } else { 422 U_ASSERT(!nodeHasBefore2(node)); 423 s = Collation::COMMON_WEIGHT16; 424 } 425 p = weight32FromNode(node); 426 weight16 = rootElements.getTertiaryBefore(p, s, weight16); 427 U_ASSERT((weight16 & ~Collation::ONLY_TERTIARY_MASK) == 0); 428 } 429 // Find or insert the new explicit weight before the current one. 430 if(previousWeightIndex >= 0 && weight16 == previousWeight16) { 431 // Tailor after the last node between adjacent root nodes. 432 index = previousIndex; 433 } else { 434 node = nodeFromWeight16(weight16) | nodeFromStrength(strength); 435 index = insertNodeBetween(previousIndex, index, node, errorCode); 436 } 437 } 438 } else { 439 // Found a stronger node with implied strength-common weight. 440 int64_t hasBefore3 = 0; 441 if(strength == UCOL_SECONDARY) { 442 U_ASSERT(!nodeHasBefore2(node)); 443 // Move the HAS_BEFORE3 flag from the parent node 444 // to the new secondary common node. 445 hasBefore3 = node & HAS_BEFORE3; 446 node = (node & ~(int64_t)HAS_BEFORE3) | HAS_BEFORE2; 447 } else { 448 U_ASSERT(!nodeHasBefore3(node)); 449 node |= HAS_BEFORE3; 450 } 451 nodes.setElementAt(node, index); 452 int32_t nextIndex = nextIndexFromNode(node); 453 // Insert default nodes with weights 02 and 05, reset to the 02 node. 454 node = nodeFromWeight16(BEFORE_WEIGHT16) | nodeFromStrength(strength); 455 index = insertNodeBetween(index, nextIndex, node, errorCode); 456 node = nodeFromWeight16(Collation::COMMON_WEIGHT16) | hasBefore3 | 457 nodeFromStrength(strength); 458 insertNodeBetween(index, nextIndex, node, errorCode); 459 } 460 // Strength of the temporary CE = strength of its reset position. 461 // Code above raises an error if the before-strength is stronger. 462 strength = ceStrength(ces[cesLength - 1]); 463 } 464 if(U_FAILURE(errorCode)) { 465 parserErrorReason = "inserting reset position for &[before n]"; 466 return; 467 } 468 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, strength); 469 } 470 471 int64_t 472 CollationBuilder::getSpecialResetPosition(const UnicodeString &str, 473 const char *&parserErrorReason, UErrorCode &errorCode) { 474 U_ASSERT(str.length() == 2); 475 int64_t ce; 476 int32_t strength = UCOL_PRIMARY; 477 UBool isBoundary = FALSE; 478 UChar32 pos = str.charAt(1) - CollationRuleParser::POS_BASE; 479 U_ASSERT(0 <= pos && pos <= CollationRuleParser::LAST_TRAILING); 480 switch(pos) { 481 case CollationRuleParser::FIRST_TERTIARY_IGNORABLE: 482 // Quaternary CEs are not supported. 483 // Non-zero quaternary weights are possible only on tertiary or stronger CEs. 484 return 0; 485 case CollationRuleParser::LAST_TERTIARY_IGNORABLE: 486 return 0; 487 case CollationRuleParser::FIRST_SECONDARY_IGNORABLE: { 488 // Look for a tailored tertiary node after [0, 0, 0]. 489 int32_t index = findOrInsertNodeForRootCE(0, UCOL_TERTIARY, errorCode); 490 if(U_FAILURE(errorCode)) { return 0; } 491 int64_t node = nodes.elementAti(index); 492 if((index = nextIndexFromNode(node)) != 0) { 493 node = nodes.elementAti(index); 494 U_ASSERT(strengthFromNode(node) <= UCOL_TERTIARY); 495 if(isTailoredNode(node) && strengthFromNode(node) == UCOL_TERTIARY) { 496 return tempCEFromIndexAndStrength(index, UCOL_TERTIARY); 497 } 498 } 499 return rootElements.getFirstTertiaryCE(); 500 // No need to look for nodeHasAnyBefore() on a tertiary node. 501 } 502 case CollationRuleParser::LAST_SECONDARY_IGNORABLE: 503 ce = rootElements.getLastTertiaryCE(); 504 strength = UCOL_TERTIARY; 505 break; 506 case CollationRuleParser::FIRST_PRIMARY_IGNORABLE: { 507 // Look for a tailored secondary node after [0, 0, *]. 508 int32_t index = findOrInsertNodeForRootCE(0, UCOL_SECONDARY, errorCode); 509 if(U_FAILURE(errorCode)) { return 0; } 510 int64_t node = nodes.elementAti(index); 511 while((index = nextIndexFromNode(node)) != 0) { 512 node = nodes.elementAti(index); 513 strength = strengthFromNode(node); 514 if(strength < UCOL_SECONDARY) { break; } 515 if(strength == UCOL_SECONDARY) { 516 if(isTailoredNode(node)) { 517 if(nodeHasBefore3(node)) { 518 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 519 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 520 } 521 return tempCEFromIndexAndStrength(index, UCOL_SECONDARY); 522 } else { 523 break; 524 } 525 } 526 } 527 ce = rootElements.getFirstSecondaryCE(); 528 strength = UCOL_SECONDARY; 529 break; 530 } 531 case CollationRuleParser::LAST_PRIMARY_IGNORABLE: 532 ce = rootElements.getLastSecondaryCE(); 533 strength = UCOL_SECONDARY; 534 break; 535 case CollationRuleParser::FIRST_VARIABLE: 536 ce = rootElements.getFirstPrimaryCE(); 537 isBoundary = TRUE; // FractionalUCA.txt: FDD1 00A0, SPACE first primary 538 break; 539 case CollationRuleParser::LAST_VARIABLE: 540 ce = rootElements.lastCEWithPrimaryBefore(variableTop + 1); 541 break; 542 case CollationRuleParser::FIRST_REGULAR: 543 ce = rootElements.firstCEWithPrimaryAtLeast(variableTop + 1); 544 isBoundary = TRUE; // FractionalUCA.txt: FDD1 263A, SYMBOL first primary 545 break; 546 case CollationRuleParser::LAST_REGULAR: 547 // Use the Hani-first-primary rather than the actual last "regular" CE before it, 548 // for backward compatibility with behavior before the introduction of 549 // script-first-primary CEs in the root collator. 550 ce = rootElements.firstCEWithPrimaryAtLeast( 551 baseData->getFirstPrimaryForGroup(USCRIPT_HAN)); 552 break; 553 case CollationRuleParser::FIRST_IMPLICIT: { 554 uint32_t ce32 = baseData->getCE32(0x4e00); 555 U_ASSERT(Collation::hasCE32Tag(ce32, Collation::OFFSET_TAG)); 556 ce = baseData->getCEFromOffsetCE32(0x4e00, ce32); 557 break; 558 } 559 case CollationRuleParser::LAST_IMPLICIT: 560 // We do not support tailoring to an unassigned-implicit CE. 561 errorCode = U_UNSUPPORTED_ERROR; 562 parserErrorReason = "reset to [last implicit] not supported"; 563 return 0; 564 case CollationRuleParser::FIRST_TRAILING: 565 ce = Collation::makeCE(Collation::FIRST_TRAILING_PRIMARY); 566 isBoundary = TRUE; // trailing first primary (there is no mapping for it) 567 break; 568 case CollationRuleParser::LAST_TRAILING: 569 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 570 parserErrorReason = "LDML forbids tailoring to U+FFFF"; 571 return 0; 572 default: 573 U_ASSERT(FALSE); 574 return 0; 575 } 576 577 int32_t index = findOrInsertNodeForRootCE(ce, strength, errorCode); 578 if(U_FAILURE(errorCode)) { return 0; } 579 int64_t node = nodes.elementAti(index); 580 if((pos & 1) == 0) { 581 // even pos = [first xyz] 582 if(!nodeHasAnyBefore(node) && isBoundary) { 583 // A <group> first primary boundary is artificially added to FractionalUCA.txt. 584 // It is reachable via its special contraction, but is not normally used. 585 // Find the first character tailored after the boundary CE, 586 // or the first real root CE after it. 587 if((index = nextIndexFromNode(node)) != 0) { 588 // If there is a following node, then it must be tailored 589 // because there are no root CEs with a boundary primary 590 // and non-common secondary/tertiary weights. 591 node = nodes.elementAti(index); 592 U_ASSERT(isTailoredNode(node)); 593 ce = tempCEFromIndexAndStrength(index, strength); 594 } else { 595 U_ASSERT(strength == UCOL_PRIMARY); 596 uint32_t p = (uint32_t)(ce >> 32); 597 int32_t pIndex = rootElements.findPrimary(p); 598 UBool isCompressible = baseData->isCompressiblePrimary(p); 599 p = rootElements.getPrimaryAfter(p, pIndex, isCompressible); 600 ce = Collation::makeCE(p); 601 index = findOrInsertNodeForRootCE(ce, UCOL_PRIMARY, errorCode); 602 if(U_FAILURE(errorCode)) { return 0; } 603 node = nodes.elementAti(index); 604 } 605 } 606 if(nodeHasAnyBefore(node)) { 607 // Get the first node that was tailored before this one at a weaker strength. 608 if(nodeHasBefore2(node)) { 609 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 610 node = nodes.elementAti(index); 611 } 612 if(nodeHasBefore3(node)) { 613 index = nextIndexFromNode(nodes.elementAti(nextIndexFromNode(node))); 614 } 615 U_ASSERT(isTailoredNode(nodes.elementAti(index))); 616 ce = tempCEFromIndexAndStrength(index, strength); 617 } 618 } else { 619 // odd pos = [last xyz] 620 // Find the last node that was tailored after the [last xyz] 621 // at a strength no greater than the position's strength. 622 for(;;) { 623 int32_t nextIndex = nextIndexFromNode(node); 624 if(nextIndex == 0) { break; } 625 int64_t nextNode = nodes.elementAti(nextIndex); 626 if(strengthFromNode(nextNode) < strength) { break; } 627 index = nextIndex; 628 node = nextNode; 629 } 630 // Do not make a temporary CE for a root node. 631 // This last node might be the node for the root CE itself, 632 // or a node with a common secondary or tertiary weight. 633 if(isTailoredNode(node)) { 634 ce = tempCEFromIndexAndStrength(index, strength); 635 } 636 } 637 return ce; 638 } 639 640 void 641 CollationBuilder::addRelation(int32_t strength, const UnicodeString &prefix, 642 const UnicodeString &str, const UnicodeString &extension, 643 const char *&parserErrorReason, UErrorCode &errorCode) { 644 if(U_FAILURE(errorCode)) { return; } 645 UnicodeString nfdPrefix; 646 if(!prefix.isEmpty()) { 647 nfd.normalize(prefix, nfdPrefix, errorCode); 648 if(U_FAILURE(errorCode)) { 649 parserErrorReason = "normalizing the relation prefix"; 650 return; 651 } 652 } 653 UnicodeString nfdString = nfd.normalize(str, errorCode); 654 if(U_FAILURE(errorCode)) { 655 parserErrorReason = "normalizing the relation string"; 656 return; 657 } 658 659 // The runtime code decomposes Hangul syllables on the fly, 660 // with recursive processing but without making the Jamo pieces visible for matching. 661 // It does not work with certain types of contextual mappings. 662 int32_t nfdLength = nfdString.length(); 663 if(nfdLength >= 2) { 664 UChar c = nfdString.charAt(0); 665 if(Hangul::isJamoL(c) || Hangul::isJamoV(c)) { 666 // While handling a Hangul syllable, contractions starting with Jamo L or V 667 // would not see the following Jamo of that syllable. 668 errorCode = U_UNSUPPORTED_ERROR; 669 parserErrorReason = "contractions starting with conjoining Jamo L or V not supported"; 670 return; 671 } 672 c = nfdString.charAt(nfdLength - 1); 673 if(Hangul::isJamoL(c) || 674 (Hangul::isJamoV(c) && Hangul::isJamoL(nfdString.charAt(nfdLength - 2)))) { 675 // A contraction ending with Jamo L or L+V would require 676 // generating Hangul syllables in addTailComposites() (588 for a Jamo L), 677 // or decomposing a following Hangul syllable on the fly, during contraction matching. 678 errorCode = U_UNSUPPORTED_ERROR; 679 parserErrorReason = "contractions ending with conjoining Jamo L or L+V not supported"; 680 return; 681 } 682 // A Hangul syllable completely inside a contraction is ok. 683 } 684 // Note: If there is a prefix, then the parser checked that 685 // both the prefix and the string beging with NFC boundaries (not Jamo V or T). 686 // Therefore: prefix.isEmpty() || !isJamoVOrT(nfdString.charAt(0)) 687 // (While handling a Hangul syllable, prefixes on Jamo V or T 688 // would not see the previous Jamo of that syllable.) 689 690 if(strength != UCOL_IDENTICAL) { 691 // Find the node index after which we insert the new tailored node. 692 int32_t index = findOrInsertNodeForCEs(strength, parserErrorReason, errorCode); 693 U_ASSERT(cesLength > 0); 694 int64_t ce = ces[cesLength - 1]; 695 if(strength == UCOL_PRIMARY && !isTempCE(ce) && (uint32_t)(ce >> 32) == 0) { 696 // There is no primary gap between ignorables and the space-first-primary. 697 errorCode = U_UNSUPPORTED_ERROR; 698 parserErrorReason = "tailoring primary after ignorables not supported"; 699 return; 700 } 701 if(strength == UCOL_QUATERNARY && ce == 0) { 702 // The CE data structure does not support non-zero quaternary weights 703 // on tertiary ignorables. 704 errorCode = U_UNSUPPORTED_ERROR; 705 parserErrorReason = "tailoring quaternary after tertiary ignorables not supported"; 706 return; 707 } 708 // Insert the new tailored node. 709 index = insertTailoredNodeAfter(index, strength, errorCode); 710 if(U_FAILURE(errorCode)) { 711 parserErrorReason = "modifying collation elements"; 712 return; 713 } 714 // Strength of the temporary CE: 715 // The new relation may yield a stronger CE but not a weaker one. 716 int32_t tempStrength = ceStrength(ce); 717 if(strength < tempStrength) { tempStrength = strength; } 718 ces[cesLength - 1] = tempCEFromIndexAndStrength(index, tempStrength); 719 } 720 721 setCaseBits(nfdString, parserErrorReason, errorCode); 722 if(U_FAILURE(errorCode)) { return; } 723 724 int32_t cesLengthBeforeExtension = cesLength; 725 if(!extension.isEmpty()) { 726 UnicodeString nfdExtension = nfd.normalize(extension, errorCode); 727 if(U_FAILURE(errorCode)) { 728 parserErrorReason = "normalizing the relation extension"; 729 return; 730 } 731 cesLength = dataBuilder->getCEs(nfdExtension, ces, cesLength); 732 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 733 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 734 parserErrorReason = 735 "extension string adds too many collation elements (more than 31 total)"; 736 return; 737 } 738 } 739 uint32_t ce32 = Collation::UNASSIGNED_CE32; 740 if((prefix != nfdPrefix || str != nfdString) && 741 !ignorePrefix(prefix, errorCode) && !ignoreString(str, errorCode)) { 742 // Map from the original input to the CEs. 743 // We do this in case the canonical closure is incomplete, 744 // so that it is possible to explicitly provide the missing mappings. 745 ce32 = addIfDifferent(prefix, str, ces, cesLength, ce32, errorCode); 746 } 747 addWithClosure(nfdPrefix, nfdString, ces, cesLength, ce32, errorCode); 748 if(U_FAILURE(errorCode)) { 749 parserErrorReason = "writing collation elements"; 750 return; 751 } 752 cesLength = cesLengthBeforeExtension; 753 } 754 755 int32_t 756 CollationBuilder::findOrInsertNodeForCEs(int32_t strength, const char *&parserErrorReason, 757 UErrorCode &errorCode) { 758 if(U_FAILURE(errorCode)) { return 0; } 759 U_ASSERT(UCOL_PRIMARY <= strength && strength <= UCOL_QUATERNARY); 760 761 // Find the last CE that is at least as "strong" as the requested difference. 762 // Note: Stronger is smaller (UCOL_PRIMARY=0). 763 int64_t ce; 764 for(;; --cesLength) { 765 if(cesLength == 0) { 766 ce = ces[0] = 0; 767 cesLength = 1; 768 break; 769 } else { 770 ce = ces[cesLength - 1]; 771 } 772 if(ceStrength(ce) <= strength) { break; } 773 } 774 775 if(isTempCE(ce)) { 776 // No need to findCommonNode() here for lower levels 777 // because insertTailoredNodeAfter() will do that anyway. 778 return indexFromTempCE(ce); 779 } 780 781 // root CE 782 if((uint8_t)(ce >> 56) == Collation::UNASSIGNED_IMPLICIT_BYTE) { 783 errorCode = U_UNSUPPORTED_ERROR; 784 parserErrorReason = "tailoring relative to an unassigned code point not supported"; 785 return 0; 786 } 787 return findOrInsertNodeForRootCE(ce, strength, errorCode); 788 } 789 790 int32_t 791 CollationBuilder::findOrInsertNodeForRootCE(int64_t ce, int32_t strength, UErrorCode &errorCode) { 792 if(U_FAILURE(errorCode)) { return 0; } 793 U_ASSERT((uint8_t)(ce >> 56) != Collation::UNASSIGNED_IMPLICIT_BYTE); 794 795 // Find or insert the node for each of the root CE's weights, 796 // down to the requested level/strength. 797 // Root CEs must have common=zero quaternary weights (for which we never insert any nodes). 798 U_ASSERT((ce & 0xc0) == 0); 799 int32_t index = findOrInsertNodeForPrimary((uint32_t)(ce >> 32) , errorCode); 800 if(strength >= UCOL_SECONDARY) { 801 uint32_t lower32 = (uint32_t)ce; 802 index = findOrInsertWeakNode(index, lower32 >> 16, UCOL_SECONDARY, errorCode); 803 if(strength >= UCOL_TERTIARY) { 804 index = findOrInsertWeakNode(index, lower32 & Collation::ONLY_TERTIARY_MASK, 805 UCOL_TERTIARY, errorCode); 806 } 807 } 808 return index; 809 } 810 811 namespace { 812 813 /** 814 * Like Java Collections.binarySearch(List, key, Comparator). 815 * 816 * @return the index>=0 where the item was found, 817 * or the index<0 for inserting the string at ~index in sorted order 818 * (index into rootPrimaryIndexes) 819 */ 820 int32_t 821 binarySearchForRootPrimaryNode(const int32_t *rootPrimaryIndexes, int32_t length, 822 const int64_t *nodes, uint32_t p) { 823 if(length == 0) { return ~0; } 824 int32_t start = 0; 825 int32_t limit = length; 826 for (;;) { 827 int32_t i = (start + limit) / 2; 828 int64_t node = nodes[rootPrimaryIndexes[i]]; 829 uint32_t nodePrimary = (uint32_t)(node >> 32); // weight32FromNode(node) 830 if (p == nodePrimary) { 831 return i; 832 } else if (p < nodePrimary) { 833 if (i == start) { 834 return ~start; // insert s before i 835 } 836 limit = i; 837 } else { 838 if (i == start) { 839 return ~(start + 1); // insert s after i 840 } 841 start = i; 842 } 843 } 844 } 845 846 } // namespace 847 848 int32_t 849 CollationBuilder::findOrInsertNodeForPrimary(uint32_t p, UErrorCode &errorCode) { 850 if(U_FAILURE(errorCode)) { return 0; } 851 852 int32_t rootIndex = binarySearchForRootPrimaryNode( 853 rootPrimaryIndexes.getBuffer(), rootPrimaryIndexes.size(), nodes.getBuffer(), p); 854 if(rootIndex >= 0) { 855 return rootPrimaryIndexes.elementAti(rootIndex); 856 } else { 857 // Start a new list of nodes with this primary. 858 int32_t index = nodes.size(); 859 nodes.addElement(nodeFromWeight32(p), errorCode); 860 rootPrimaryIndexes.insertElementAt(index, ~rootIndex, errorCode); 861 return index; 862 } 863 } 864 865 int32_t 866 CollationBuilder::findOrInsertWeakNode(int32_t index, uint32_t weight16, int32_t level, UErrorCode &errorCode) { 867 if(U_FAILURE(errorCode)) { return 0; } 868 U_ASSERT(0 <= index && index < nodes.size()); 869 870 U_ASSERT(weight16 == 0 || weight16 >= Collation::COMMON_WEIGHT16); 871 // Only reset-before inserts common weights. 872 if(weight16 == Collation::COMMON_WEIGHT16) { 873 return findCommonNode(index, level); 874 } 875 // Find the root CE's weight for this level. 876 // Postpone insertion if not found: 877 // Insert the new root node before the next stronger node, 878 // or before the next root node with the same strength and a larger weight. 879 int64_t node = nodes.elementAti(index); 880 int32_t nextIndex; 881 while((nextIndex = nextIndexFromNode(node)) != 0) { 882 node = nodes.elementAti(nextIndex); 883 int32_t nextStrength = strengthFromNode(node); 884 if(nextStrength <= level) { 885 // Insert before a stronger node. 886 if(nextStrength < level) { break; } 887 // nextStrength == level 888 if(!isTailoredNode(node)) { 889 uint32_t nextWeight16 = weight16FromNode(node); 890 if(nextWeight16 == weight16) { 891 // Found the node for the root CE up to this level. 892 return nextIndex; 893 } 894 // Insert before a node with a larger same-strength weight. 895 if(nextWeight16 > weight16) { break; } 896 } 897 } 898 // Skip the next node. 899 index = nextIndex; 900 } 901 node = nodeFromWeight16(weight16) | nodeFromStrength(level); 902 return insertNodeBetween(index, nextIndex, node, errorCode); 903 } 904 905 int32_t 906 CollationBuilder::insertTailoredNodeAfter(int32_t index, int32_t strength, UErrorCode &errorCode) { 907 if(U_FAILURE(errorCode)) { return 0; } 908 U_ASSERT(0 <= index && index < nodes.size()); 909 if(strength >= UCOL_SECONDARY) { 910 index = findCommonNode(index, UCOL_SECONDARY); 911 if(strength >= UCOL_TERTIARY) { 912 index = findCommonNode(index, UCOL_TERTIARY); 913 } 914 } 915 // Postpone insertion: 916 // Insert the new node before the next one with a strength at least as strong. 917 int64_t node = nodes.elementAti(index); 918 int32_t nextIndex; 919 while((nextIndex = nextIndexFromNode(node)) != 0) { 920 node = nodes.elementAti(nextIndex); 921 if(strengthFromNode(node) <= strength) { break; } 922 // Skip the next node which has a weaker (larger) strength than the new one. 923 index = nextIndex; 924 } 925 node = IS_TAILORED | nodeFromStrength(strength); 926 return insertNodeBetween(index, nextIndex, node, errorCode); 927 } 928 929 int32_t 930 CollationBuilder::insertNodeBetween(int32_t index, int32_t nextIndex, int64_t node, 931 UErrorCode &errorCode) { 932 if(U_FAILURE(errorCode)) { return 0; } 933 U_ASSERT(previousIndexFromNode(node) == 0); 934 U_ASSERT(nextIndexFromNode(node) == 0); 935 U_ASSERT(nextIndexFromNode(nodes.elementAti(index)) == nextIndex); 936 // Append the new node and link it to the existing nodes. 937 int32_t newIndex = nodes.size(); 938 node |= nodeFromPreviousIndex(index) | nodeFromNextIndex(nextIndex); 939 nodes.addElement(node, errorCode); 940 if(U_FAILURE(errorCode)) { return 0; } 941 // nodes[index].nextIndex = newIndex 942 node = nodes.elementAti(index); 943 nodes.setElementAt(changeNodeNextIndex(node, newIndex), index); 944 // nodes[nextIndex].previousIndex = newIndex 945 if(nextIndex != 0) { 946 node = nodes.elementAti(nextIndex); 947 nodes.setElementAt(changeNodePreviousIndex(node, newIndex), nextIndex); 948 } 949 return newIndex; 950 } 951 952 int32_t 953 CollationBuilder::findCommonNode(int32_t index, int32_t strength) const { 954 U_ASSERT(UCOL_SECONDARY <= strength && strength <= UCOL_TERTIARY); 955 int64_t node = nodes.elementAti(index); 956 if(strengthFromNode(node) >= strength) { 957 // The current node is no stronger. 958 return index; 959 } 960 if(strength == UCOL_SECONDARY ? !nodeHasBefore2(node) : !nodeHasBefore3(node)) { 961 // The current node implies the strength-common weight. 962 return index; 963 } 964 index = nextIndexFromNode(node); 965 node = nodes.elementAti(index); 966 U_ASSERT(!isTailoredNode(node) && strengthFromNode(node) == strength && 967 weight16FromNode(node) == BEFORE_WEIGHT16); 968 // Skip to the explicit common node. 969 do { 970 index = nextIndexFromNode(node); 971 node = nodes.elementAti(index); 972 U_ASSERT(strengthFromNode(node) >= strength); 973 } while(isTailoredNode(node) || strengthFromNode(node) > strength); 974 U_ASSERT(weight16FromNode(node) == Collation::COMMON_WEIGHT16); 975 return index; 976 } 977 978 void 979 CollationBuilder::setCaseBits(const UnicodeString &nfdString, 980 const char *&parserErrorReason, UErrorCode &errorCode) { 981 if(U_FAILURE(errorCode)) { return; } 982 int32_t numTailoredPrimaries = 0; 983 for(int32_t i = 0; i < cesLength; ++i) { 984 if(ceStrength(ces[i]) == UCOL_PRIMARY) { ++numTailoredPrimaries; } 985 } 986 // We should not be able to get too many case bits because 987 // cesLength<=31==MAX_EXPANSION_LENGTH. 988 // 31 pairs of case bits fit into an int64_t without setting its sign bit. 989 U_ASSERT(numTailoredPrimaries <= 31); 990 991 int64_t cases = 0; 992 if(numTailoredPrimaries > 0) { 993 const UChar *s = nfdString.getBuffer(); 994 UTF16CollationIterator baseCEs(baseData, FALSE, s, s, s + nfdString.length()); 995 int32_t baseCEsLength = baseCEs.fetchCEs(errorCode) - 1; 996 if(U_FAILURE(errorCode)) { 997 parserErrorReason = "fetching root CEs for tailored string"; 998 return; 999 } 1000 U_ASSERT(baseCEsLength >= 0 && baseCEs.getCE(baseCEsLength) == Collation::NO_CE); 1001 1002 uint32_t lastCase = 0; 1003 int32_t numBasePrimaries = 0; 1004 for(int32_t i = 0; i < baseCEsLength; ++i) { 1005 int64_t ce = baseCEs.getCE(i); 1006 if((ce >> 32) != 0) { 1007 ++numBasePrimaries; 1008 uint32_t c = ((uint32_t)ce >> 14) & 3; 1009 U_ASSERT(c == 0 || c == 2); // lowercase or uppercase, no mixed case in any base CE 1010 if(numBasePrimaries < numTailoredPrimaries) { 1011 cases |= (int64_t)c << ((numBasePrimaries - 1) * 2); 1012 } else if(numBasePrimaries == numTailoredPrimaries) { 1013 lastCase = c; 1014 } else if(c != lastCase) { 1015 // There are more base primary CEs than tailored primaries. 1016 // Set mixed case if the case bits of the remainder differ. 1017 lastCase = 1; 1018 // Nothing more can change. 1019 break; 1020 } 1021 } 1022 } 1023 if(numBasePrimaries >= numTailoredPrimaries) { 1024 cases |= (int64_t)lastCase << ((numTailoredPrimaries - 1) * 2); 1025 } 1026 } 1027 1028 for(int32_t i = 0; i < cesLength; ++i) { 1029 int64_t ce = ces[i] & INT64_C(0xffffffffffff3fff); // clear old case bits 1030 int32_t strength = ceStrength(ce); 1031 if(strength == UCOL_PRIMARY) { 1032 ce |= (cases & 3) << 14; 1033 cases >>= 2; 1034 } else if(strength == UCOL_TERTIARY) { 1035 // Tertiary CEs must have uppercase bits. 1036 // See the LDML spec, and comments in class CollationCompare. 1037 ce |= 0x8000; 1038 } 1039 // Tertiary ignorable CEs must have 0 case bits. 1040 // We set 0 case bits for secondary CEs too 1041 // since currently only U+0345 is cased and maps to a secondary CE, 1042 // and it is lowercase. Other secondaries are uncased. 1043 // See [[:Cased:]&[:uca1=:]] where uca1 queries the root primary weight. 1044 ces[i] = ce; 1045 } 1046 } 1047 1048 void 1049 CollationBuilder::suppressContractions(const UnicodeSet &set, const char *&parserErrorReason, 1050 UErrorCode &errorCode) { 1051 if(U_FAILURE(errorCode)) { return; } 1052 dataBuilder->suppressContractions(set, errorCode); 1053 if(U_FAILURE(errorCode)) { 1054 parserErrorReason = "application of [suppressContractions [set]] failed"; 1055 } 1056 } 1057 1058 void 1059 CollationBuilder::optimize(const UnicodeSet &set, const char *& /* parserErrorReason */, 1060 UErrorCode &errorCode) { 1061 if(U_FAILURE(errorCode)) { return; } 1062 optimizeSet.addAll(set); 1063 } 1064 1065 uint32_t 1066 CollationBuilder::addWithClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1067 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1068 UErrorCode &errorCode) { 1069 // Map from the NFD input to the CEs. 1070 ce32 = addIfDifferent(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1071 ce32 = addOnlyClosure(nfdPrefix, nfdString, newCEs, newCEsLength, ce32, errorCode); 1072 addTailComposites(nfdPrefix, nfdString, errorCode); 1073 return ce32; 1074 } 1075 1076 uint32_t 1077 CollationBuilder::addOnlyClosure(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1078 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1079 UErrorCode &errorCode) { 1080 if(U_FAILURE(errorCode)) { return ce32; } 1081 1082 // Map from canonically equivalent input to the CEs. (But not from the all-NFD input.) 1083 if(nfdPrefix.isEmpty()) { 1084 CanonicalIterator stringIter(nfdString, errorCode); 1085 if(U_FAILURE(errorCode)) { return ce32; } 1086 UnicodeString prefix; 1087 for(;;) { 1088 UnicodeString str = stringIter.next(); 1089 if(str.isBogus()) { break; } 1090 if(ignoreString(str, errorCode) || str == nfdString) { continue; } 1091 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1092 if(U_FAILURE(errorCode)) { return ce32; } 1093 } 1094 } else { 1095 CanonicalIterator prefixIter(nfdPrefix, errorCode); 1096 CanonicalIterator stringIter(nfdString, errorCode); 1097 if(U_FAILURE(errorCode)) { return ce32; } 1098 for(;;) { 1099 UnicodeString prefix = prefixIter.next(); 1100 if(prefix.isBogus()) { break; } 1101 if(ignorePrefix(prefix, errorCode)) { continue; } 1102 UBool samePrefix = prefix == nfdPrefix; 1103 for(;;) { 1104 UnicodeString str = stringIter.next(); 1105 if(str.isBogus()) { break; } 1106 if(ignoreString(str, errorCode) || (samePrefix && str == nfdString)) { continue; } 1107 ce32 = addIfDifferent(prefix, str, newCEs, newCEsLength, ce32, errorCode); 1108 if(U_FAILURE(errorCode)) { return ce32; } 1109 } 1110 stringIter.reset(); 1111 } 1112 } 1113 return ce32; 1114 } 1115 1116 void 1117 CollationBuilder::addTailComposites(const UnicodeString &nfdPrefix, const UnicodeString &nfdString, 1118 UErrorCode &errorCode) { 1119 if(U_FAILURE(errorCode)) { return; } 1120 1121 // Look for the last starter in the NFD string. 1122 UChar32 lastStarter; 1123 int32_t indexAfterLastStarter = nfdString.length(); 1124 for(;;) { 1125 if(indexAfterLastStarter == 0) { return; } // no starter at all 1126 lastStarter = nfdString.char32At(indexAfterLastStarter - 1); 1127 if(nfd.getCombiningClass(lastStarter) == 0) { break; } 1128 indexAfterLastStarter -= U16_LENGTH(lastStarter); 1129 } 1130 // No closure to Hangul syllables since we decompose them on the fly. 1131 if(Hangul::isJamoL(lastStarter)) { return; } 1132 1133 // Are there any composites whose decomposition starts with the lastStarter? 1134 // Note: Normalizer2Impl does not currently return start sets for NFC_QC=Maybe characters. 1135 // We might find some more equivalent mappings here if it did. 1136 UnicodeSet composites; 1137 if(!nfcImpl.getCanonStartSet(lastStarter, composites)) { return; } 1138 1139 UnicodeString decomp; 1140 UnicodeString newNFDString, newString; 1141 int64_t newCEs[Collation::MAX_EXPANSION_LENGTH]; 1142 UnicodeSetIterator iter(composites); 1143 while(iter.next()) { 1144 U_ASSERT(!iter.isString()); 1145 UChar32 composite = iter.getCodepoint(); 1146 nfd.getDecomposition(composite, decomp); 1147 if(!mergeCompositeIntoString(nfdString, indexAfterLastStarter, composite, decomp, 1148 newNFDString, newString, errorCode)) { 1149 continue; 1150 } 1151 int32_t newCEsLength = dataBuilder->getCEs(nfdPrefix, newNFDString, newCEs, 0); 1152 if(newCEsLength > Collation::MAX_EXPANSION_LENGTH) { 1153 // Ignore mappings that we cannot store. 1154 continue; 1155 } 1156 // Note: It is possible that the newCEs do not make use of the mapping 1157 // for which we are adding the tail composites, in which case we might be adding 1158 // unnecessary mappings. 1159 // For example, when we add tail composites for ae^ (^=combining circumflex), 1160 // UCA discontiguous-contraction matching does not find any matches 1161 // for ae_^ (_=any combining diacritic below) *unless* there is also 1162 // a contraction mapping for ae. 1163 // Thus, if there is no ae contraction, then the ae^ mapping is ignored 1164 // while fetching the newCEs for ae_^. 1165 // TODO: Try to detect this effectively. 1166 // (Alternatively, print a warning when prefix contractions are missing.) 1167 1168 // We do not need an explicit mapping for the NFD strings. 1169 // It is fine if the NFD input collates like this via a sequence of mappings. 1170 // It also saves a little bit of space, and may reduce the set of characters with contractions. 1171 uint32_t ce32 = addIfDifferent(nfdPrefix, newString, 1172 newCEs, newCEsLength, Collation::UNASSIGNED_CE32, errorCode); 1173 if(ce32 != Collation::UNASSIGNED_CE32) { 1174 // was different, was added 1175 addOnlyClosure(nfdPrefix, newNFDString, newCEs, newCEsLength, ce32, errorCode); 1176 } 1177 } 1178 } 1179 1180 UBool 1181 CollationBuilder::mergeCompositeIntoString(const UnicodeString &nfdString, 1182 int32_t indexAfterLastStarter, 1183 UChar32 composite, const UnicodeString &decomp, 1184 UnicodeString &newNFDString, UnicodeString &newString, 1185 UErrorCode &errorCode) const { 1186 if(U_FAILURE(errorCode)) { return FALSE; } 1187 U_ASSERT(nfdString.char32At(indexAfterLastStarter - 1) == decomp.char32At(0)); 1188 int32_t lastStarterLength = decomp.moveIndex32(0, 1); 1189 if(lastStarterLength == decomp.length()) { 1190 // Singleton decompositions should be found by addWithClosure() 1191 // and the CanonicalIterator, so we can ignore them here. 1192 return FALSE; 1193 } 1194 if(nfdString.compare(indexAfterLastStarter, 0x7fffffff, 1195 decomp, lastStarterLength, 0x7fffffff) == 0) { 1196 // same strings, nothing new to be found here 1197 return FALSE; 1198 } 1199 1200 // Make new FCD strings that combine a composite, or its decomposition, 1201 // into the nfdString's last starter and the combining marks following it. 1202 // Make an NFD version, and a version with the composite. 1203 newNFDString.setTo(nfdString, 0, indexAfterLastStarter); 1204 newString.setTo(nfdString, 0, indexAfterLastStarter - lastStarterLength).append(composite); 1205 1206 // The following is related to discontiguous contraction matching, 1207 // but builds only FCD strings (or else returns FALSE). 1208 int32_t sourceIndex = indexAfterLastStarter; 1209 int32_t decompIndex = lastStarterLength; 1210 // Small optimization: We keep the source character across loop iterations 1211 // because we do not always consume it, 1212 // and then need not fetch it again nor look up its combining class again. 1213 UChar32 sourceChar = U_SENTINEL; 1214 // The cc variables need to be declared before the loop so that at the end 1215 // they are set to the last combining classes seen. 1216 uint8_t sourceCC = 0; 1217 uint8_t decompCC = 0; 1218 for(;;) { 1219 if(sourceChar < 0) { 1220 if(sourceIndex >= nfdString.length()) { break; } 1221 sourceChar = nfdString.char32At(sourceIndex); 1222 sourceCC = nfd.getCombiningClass(sourceChar); 1223 U_ASSERT(sourceCC != 0); 1224 } 1225 // We consume a decomposition character in each iteration. 1226 if(decompIndex >= decomp.length()) { break; } 1227 UChar32 decompChar = decomp.char32At(decompIndex); 1228 decompCC = nfd.getCombiningClass(decompChar); 1229 // Compare the two characters and their combining classes. 1230 if(decompCC == 0) { 1231 // Unable to merge because the source contains a non-zero combining mark 1232 // but the composite's decomposition contains another starter. 1233 // The strings would not be equivalent. 1234 return FALSE; 1235 } else if(sourceCC < decompCC) { 1236 // Composite + sourceChar would not be FCD. 1237 return FALSE; 1238 } else if(decompCC < sourceCC) { 1239 newNFDString.append(decompChar); 1240 decompIndex += U16_LENGTH(decompChar); 1241 } else if(decompChar != sourceChar) { 1242 // Blocked because same combining class. 1243 return FALSE; 1244 } else { // match: decompChar == sourceChar 1245 newNFDString.append(decompChar); 1246 decompIndex += U16_LENGTH(decompChar); 1247 sourceIndex += U16_LENGTH(decompChar); 1248 sourceChar = U_SENTINEL; 1249 } 1250 } 1251 // We are at the end of at least one of the two inputs. 1252 if(sourceChar >= 0) { // more characters from nfdString but not from decomp 1253 if(sourceCC < decompCC) { 1254 // Appending the next source character to the composite would not be FCD. 1255 return FALSE; 1256 } 1257 newNFDString.append(nfdString, sourceIndex, 0x7fffffff); 1258 newString.append(nfdString, sourceIndex, 0x7fffffff); 1259 } else if(decompIndex < decomp.length()) { // more characters from decomp, not from nfdString 1260 newNFDString.append(decomp, decompIndex, 0x7fffffff); 1261 } 1262 U_ASSERT(nfd.isNormalized(newNFDString, errorCode)); 1263 U_ASSERT(fcd.isNormalized(newString, errorCode)); 1264 U_ASSERT(nfd.normalize(newString, errorCode) == newNFDString); // canonically equivalent 1265 return TRUE; 1266 } 1267 1268 UBool 1269 CollationBuilder::ignorePrefix(const UnicodeString &s, UErrorCode &errorCode) const { 1270 // Do not map non-FCD prefixes. 1271 return !isFCD(s, errorCode); 1272 } 1273 1274 UBool 1275 CollationBuilder::ignoreString(const UnicodeString &s, UErrorCode &errorCode) const { 1276 // Do not map non-FCD strings. 1277 // Do not map strings that start with Hangul syllables: We decompose those on the fly. 1278 return !isFCD(s, errorCode) || Hangul::isHangul(s.charAt(0)); 1279 } 1280 1281 UBool 1282 CollationBuilder::isFCD(const UnicodeString &s, UErrorCode &errorCode) const { 1283 return U_SUCCESS(errorCode) && fcd.isNormalized(s, errorCode); 1284 } 1285 1286 void 1287 CollationBuilder::closeOverComposites(UErrorCode &errorCode) { 1288 UnicodeSet composites(UNICODE_STRING_SIMPLE("[:NFD_QC=N:]"), errorCode); // Java: static final 1289 if(U_FAILURE(errorCode)) { return; } 1290 // Hangul is decomposed on the fly during collation. 1291 composites.remove(Hangul::HANGUL_BASE, Hangul::HANGUL_END); 1292 UnicodeString prefix; // empty 1293 UnicodeString nfdString; 1294 UnicodeSetIterator iter(composites); 1295 while(iter.next()) { 1296 U_ASSERT(!iter.isString()); 1297 nfd.getDecomposition(iter.getCodepoint(), nfdString); 1298 cesLength = dataBuilder->getCEs(nfdString, ces, 0); 1299 if(cesLength > Collation::MAX_EXPANSION_LENGTH) { 1300 // Too many CEs from the decomposition (unusual), ignore this composite. 1301 // We could add a capacity parameter to getCEs() and reallocate if necessary. 1302 // However, this can only really happen in contrived cases. 1303 continue; 1304 } 1305 const UnicodeString &composite(iter.getString()); 1306 addIfDifferent(prefix, composite, ces, cesLength, Collation::UNASSIGNED_CE32, errorCode); 1307 } 1308 } 1309 1310 uint32_t 1311 CollationBuilder::addIfDifferent(const UnicodeString &prefix, const UnicodeString &str, 1312 const int64_t newCEs[], int32_t newCEsLength, uint32_t ce32, 1313 UErrorCode &errorCode) { 1314 if(U_FAILURE(errorCode)) { return ce32; } 1315 int64_t oldCEs[Collation::MAX_EXPANSION_LENGTH]; 1316 int32_t oldCEsLength = dataBuilder->getCEs(prefix, str, oldCEs, 0); 1317 if(!sameCEs(newCEs, newCEsLength, oldCEs, oldCEsLength)) { 1318 if(ce32 == Collation::UNASSIGNED_CE32) { 1319 ce32 = dataBuilder->encodeCEs(newCEs, newCEsLength, errorCode); 1320 } 1321 dataBuilder->addCE32(prefix, str, ce32, errorCode); 1322 } 1323 return ce32; 1324 } 1325 1326 UBool 1327 CollationBuilder::sameCEs(const int64_t ces1[], int32_t ces1Length, 1328 const int64_t ces2[], int32_t ces2Length) { 1329 if(ces1Length != ces2Length) { 1330 return FALSE; 1331 } 1332 U_ASSERT(ces1Length <= Collation::MAX_EXPANSION_LENGTH); 1333 for(int32_t i = 0; i < ces1Length; ++i) { 1334 if(ces1[i] != ces2[i]) { return FALSE; } 1335 } 1336 return TRUE; 1337 } 1338 1339 #ifdef DEBUG_COLLATION_BUILDER 1340 1341 uint32_t 1342 alignWeightRight(uint32_t w) { 1343 if(w != 0) { 1344 while((w & 0xff) == 0) { w >>= 8; } 1345 } 1346 return w; 1347 } 1348 1349 #endif 1350 1351 void 1352 CollationBuilder::makeTailoredCEs(UErrorCode &errorCode) { 1353 if(U_FAILURE(errorCode)) { return; } 1354 1355 CollationWeights primaries, secondaries, tertiaries; 1356 int64_t *nodesArray = nodes.getBuffer(); 1357 1358 for(int32_t rpi = 0; rpi < rootPrimaryIndexes.size(); ++rpi) { 1359 int32_t i = rootPrimaryIndexes.elementAti(rpi); 1360 int64_t node = nodesArray[i]; 1361 uint32_t p = weight32FromNode(node); 1362 uint32_t s = p == 0 ? 0 : Collation::COMMON_WEIGHT16; 1363 uint32_t t = s; 1364 uint32_t q = 0; 1365 UBool pIsTailored = FALSE; 1366 UBool sIsTailored = FALSE; 1367 UBool tIsTailored = FALSE; 1368 #ifdef DEBUG_COLLATION_BUILDER 1369 printf("\nprimary %lx\n", (long)alignWeightRight(p)); 1370 #endif 1371 int32_t pIndex = p == 0 ? 0 : rootElements.findPrimary(p); 1372 int32_t nextIndex = nextIndexFromNode(node); 1373 while(nextIndex != 0) { 1374 i = nextIndex; 1375 node = nodesArray[i]; 1376 nextIndex = nextIndexFromNode(node); 1377 int32_t strength = strengthFromNode(node); 1378 if(strength == UCOL_QUATERNARY) { 1379 U_ASSERT(isTailoredNode(node)); 1380 #ifdef DEBUG_COLLATION_BUILDER 1381 printf(" quat+ "); 1382 #endif 1383 if(q == 3) { 1384 errorCode = U_BUFFER_OVERFLOW_ERROR; 1385 errorReason = "quaternary tailoring gap too small"; 1386 return; 1387 } 1388 ++q; 1389 } else { 1390 if(strength == UCOL_TERTIARY) { 1391 if(isTailoredNode(node)) { 1392 #ifdef DEBUG_COLLATION_BUILDER 1393 printf(" ter+ "); 1394 #endif 1395 if(!tIsTailored) { 1396 // First tailored tertiary node for [p, s]. 1397 int32_t tCount = countTailoredNodes(nodesArray, nextIndex, 1398 UCOL_TERTIARY) + 1; 1399 uint32_t tLimit; 1400 if(t == 0) { 1401 // Gap at the beginning of the tertiary CE range. 1402 t = rootElements.getTertiaryBoundary() - 0x100; 1403 tLimit = rootElements.getFirstTertiaryCE() & Collation::ONLY_TERTIARY_MASK; 1404 } else if(t == BEFORE_WEIGHT16) { 1405 tLimit = Collation::COMMON_WEIGHT16; 1406 } else if(!pIsTailored && !sIsTailored) { 1407 // p and s are root weights. 1408 tLimit = rootElements.getTertiaryAfter(pIndex, s, t); 1409 } else { 1410 // [p, s] is tailored. 1411 U_ASSERT(t == Collation::COMMON_WEIGHT16); 1412 tLimit = rootElements.getTertiaryBoundary(); 1413 } 1414 U_ASSERT(tLimit == 0x4000 || (tLimit & ~Collation::ONLY_TERTIARY_MASK) == 0); 1415 tertiaries.initForTertiary(); 1416 if(!tertiaries.allocWeights(t, tLimit, tCount)) { 1417 errorCode = U_BUFFER_OVERFLOW_ERROR; 1418 errorReason = "tertiary tailoring gap too small"; 1419 return; 1420 } 1421 tIsTailored = TRUE; 1422 } 1423 t = tertiaries.nextWeight(); 1424 U_ASSERT(t != 0xffffffff); 1425 } else { 1426 t = weight16FromNode(node); 1427 tIsTailored = FALSE; 1428 #ifdef DEBUG_COLLATION_BUILDER 1429 printf(" ter %lx\n", (long)alignWeightRight(t)); 1430 #endif 1431 } 1432 } else { 1433 if(strength == UCOL_SECONDARY) { 1434 if(isTailoredNode(node)) { 1435 #ifdef DEBUG_COLLATION_BUILDER 1436 printf(" sec+ "); 1437 #endif 1438 if(!sIsTailored) { 1439 // First tailored secondary node for p. 1440 int32_t sCount = countTailoredNodes(nodesArray, nextIndex, 1441 UCOL_SECONDARY) + 1; 1442 uint32_t sLimit; 1443 if(s == 0) { 1444 // Gap at the beginning of the secondary CE range. 1445 s = rootElements.getSecondaryBoundary() - 0x100; 1446 sLimit = rootElements.getFirstSecondaryCE() >> 16; 1447 } else if(s == BEFORE_WEIGHT16) { 1448 sLimit = Collation::COMMON_WEIGHT16; 1449 } else if(!pIsTailored) { 1450 // p is a root primary. 1451 sLimit = rootElements.getSecondaryAfter(pIndex, s); 1452 } else { 1453 // p is a tailored primary. 1454 U_ASSERT(s == Collation::COMMON_WEIGHT16); 1455 sLimit = rootElements.getSecondaryBoundary(); 1456 } 1457 if(s == Collation::COMMON_WEIGHT16) { 1458 // Do not tailor into the getSortKey() range of 1459 // compressed common secondaries. 1460 s = rootElements.getLastCommonSecondary(); 1461 } 1462 secondaries.initForSecondary(); 1463 if(!secondaries.allocWeights(s, sLimit, sCount)) { 1464 errorCode = U_BUFFER_OVERFLOW_ERROR; 1465 errorReason = "secondary tailoring gap too small"; 1466 return; 1467 } 1468 sIsTailored = TRUE; 1469 } 1470 s = secondaries.nextWeight(); 1471 U_ASSERT(s != 0xffffffff); 1472 } else { 1473 s = weight16FromNode(node); 1474 sIsTailored = FALSE; 1475 #ifdef DEBUG_COLLATION_BUILDER 1476 printf(" sec %lx\n", (long)alignWeightRight(s)); 1477 #endif 1478 } 1479 } else /* UCOL_PRIMARY */ { 1480 U_ASSERT(isTailoredNode(node)); 1481 #ifdef DEBUG_COLLATION_BUILDER 1482 printf("pri+ "); 1483 #endif 1484 if(!pIsTailored) { 1485 // First tailored primary node in this list. 1486 int32_t pCount = countTailoredNodes(nodesArray, nextIndex, 1487 UCOL_PRIMARY) + 1; 1488 UBool isCompressible = baseData->isCompressiblePrimary(p); 1489 uint32_t pLimit = 1490 rootElements.getPrimaryAfter(p, pIndex, isCompressible); 1491 primaries.initForPrimary(isCompressible); 1492 if(!primaries.allocWeights(p, pLimit, pCount)) { 1493 errorCode = U_BUFFER_OVERFLOW_ERROR; // TODO: introduce a more specific UErrorCode? 1494 errorReason = "primary tailoring gap too small"; 1495 return; 1496 } 1497 pIsTailored = TRUE; 1498 } 1499 p = primaries.nextWeight(); 1500 U_ASSERT(p != 0xffffffff); 1501 s = Collation::COMMON_WEIGHT16; 1502 sIsTailored = FALSE; 1503 } 1504 t = s == 0 ? 0 : Collation::COMMON_WEIGHT16; 1505 tIsTailored = FALSE; 1506 } 1507 q = 0; 1508 } 1509 if(isTailoredNode(node)) { 1510 nodesArray[i] = Collation::makeCE(p, s, t, q); 1511 #ifdef DEBUG_COLLATION_BUILDER 1512 printf("%016llx\n", (long long)nodesArray[i]); 1513 #endif 1514 } 1515 } 1516 } 1517 } 1518 1519 int32_t 1520 CollationBuilder::countTailoredNodes(const int64_t *nodesArray, int32_t i, int32_t strength) { 1521 int32_t count = 0; 1522 for(;;) { 1523 if(i == 0) { break; } 1524 int64_t node = nodesArray[i]; 1525 if(strengthFromNode(node) < strength) { break; } 1526 if(strengthFromNode(node) == strength) { 1527 if(isTailoredNode(node)) { 1528 ++count; 1529 } else { 1530 break; 1531 } 1532 } 1533 i = nextIndexFromNode(node); 1534 } 1535 return count; 1536 } 1537 1538 class CEFinalizer : public CollationDataBuilder::CEModifier { 1539 public: 1540 CEFinalizer(const int64_t *ces) : finalCEs(ces) {} 1541 virtual ~CEFinalizer(); 1542 virtual int64_t modifyCE32(uint32_t ce32) const { 1543 U_ASSERT(!Collation::isSpecialCE32(ce32)); 1544 if(CollationBuilder::isTempCE32(ce32)) { 1545 // retain case bits 1546 return finalCEs[CollationBuilder::indexFromTempCE32(ce32)] | ((ce32 & 0xc0) << 8); 1547 } else { 1548 return Collation::NO_CE; 1549 } 1550 } 1551 virtual int64_t modifyCE(int64_t ce) const { 1552 if(CollationBuilder::isTempCE(ce)) { 1553 // retain case bits 1554 return finalCEs[CollationBuilder::indexFromTempCE(ce)] | (ce & 0xc000); 1555 } else { 1556 return Collation::NO_CE; 1557 } 1558 } 1559 1560 private: 1561 const int64_t *finalCEs; 1562 }; 1563 1564 CEFinalizer::~CEFinalizer() {} 1565 1566 void 1567 CollationBuilder::finalizeCEs(UErrorCode &errorCode) { 1568 if(U_FAILURE(errorCode)) { return; } 1569 LocalPointer<CollationDataBuilder> newBuilder(new CollationDataBuilder(errorCode)); 1570 if(newBuilder.isNull()) { 1571 errorCode = U_MEMORY_ALLOCATION_ERROR; 1572 return; 1573 } 1574 newBuilder->initForTailoring(baseData, errorCode); 1575 CEFinalizer finalizer(nodes.getBuffer()); 1576 newBuilder->copyFrom(*dataBuilder, finalizer, errorCode); 1577 if(U_FAILURE(errorCode)) { return; } 1578 delete dataBuilder; 1579 dataBuilder = newBuilder.orphan(); 1580 } 1581 1582 int32_t 1583 CollationBuilder::ceStrength(int64_t ce) { 1584 return 1585 isTempCE(ce) ? strengthFromTempCE(ce) : 1586 (ce & INT64_C(0xff00000000000000)) != 0 ? UCOL_PRIMARY : 1587 ((uint32_t)ce & 0xff000000) != 0 ? UCOL_SECONDARY : 1588 ce != 0 ? UCOL_TERTIARY : 1589 UCOL_IDENTICAL; 1590 } 1591 1592 U_NAMESPACE_END 1593 1594 U_NAMESPACE_USE 1595 1596 U_CAPI UCollator * U_EXPORT2 1597 ucol_openRules(const UChar *rules, int32_t rulesLength, 1598 UColAttributeValue normalizationMode, UCollationStrength strength, 1599 UParseError *parseError, UErrorCode *pErrorCode) { 1600 if(U_FAILURE(*pErrorCode)) { return NULL; } 1601 if(rules == NULL && rulesLength != 0) { 1602 *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; 1603 return NULL; 1604 } 1605 RuleBasedCollator *coll = new RuleBasedCollator(); 1606 if(coll == NULL) { 1607 *pErrorCode = U_MEMORY_ALLOCATION_ERROR; 1608 return NULL; 1609 } 1610 UnicodeString r((UBool)(rulesLength < 0), rules, rulesLength); 1611 coll->internalBuildTailoring(r, strength, normalizationMode, parseError, NULL, *pErrorCode); 1612 if(U_FAILURE(*pErrorCode)) { 1613 delete coll; 1614 return NULL; 1615 } 1616 return coll->toUCollator(); 1617 } 1618 1619 static const int32_t internalBufferSize = 512; 1620 1621 // The @internal ucol_getUnsafeSet() was moved here from ucol_sit.cpp 1622 // because it calls UnicodeSet "builder" code that depends on all Unicode properties, 1623 // and the rest of the collation "runtime" code only depends on normalization. 1624 // This function is not related to the collation builder, 1625 // but it did not seem worth moving it into its own .cpp file, 1626 // nor rewriting it to use lower-level UnicodeSet and Normalizer2Impl methods. 1627 U_CAPI int32_t U_EXPORT2 1628 ucol_getUnsafeSet( const UCollator *coll, 1629 USet *unsafe, 1630 UErrorCode *status) 1631 { 1632 UChar buffer[internalBufferSize]; 1633 int32_t len = 0; 1634 1635 uset_clear(unsafe); 1636 1637 // cccpattern = "[[:^tccc=0:][:^lccc=0:]]", unfortunately variant 1638 static const UChar cccpattern[25] = { 0x5b, 0x5b, 0x3a, 0x5e, 0x74, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 1639 0x5b, 0x3a, 0x5e, 0x6c, 0x63, 0x63, 0x63, 0x3d, 0x30, 0x3a, 0x5d, 0x5d, 0x00 }; 1640 1641 // add chars that fail the fcd check 1642 uset_applyPattern(unsafe, cccpattern, 24, USET_IGNORE_SPACE, status); 1643 1644 // add lead/trail surrogates 1645 // (trail surrogates should need to be unsafe only if the caller tests for UTF-16 code *units*, 1646 // not when testing code *points*) 1647 uset_addRange(unsafe, 0xd800, 0xdfff); 1648 1649 USet *contractions = uset_open(0,0); 1650 1651 int32_t i = 0, j = 0; 1652 ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status); 1653 int32_t contsSize = uset_size(contractions); 1654 UChar32 c = 0; 1655 // Contraction set consists only of strings 1656 // to get unsafe code points, we need to 1657 // break the strings apart and add them to the unsafe set 1658 for(i = 0; i < contsSize; i++) { 1659 len = uset_getItem(contractions, i, NULL, NULL, buffer, internalBufferSize, status); 1660 if(len > 0) { 1661 j = 0; 1662 while(j < len) { 1663 U16_NEXT(buffer, j, len, c); 1664 if(j < len) { 1665 uset_add(unsafe, c); 1666 } 1667 } 1668 } 1669 } 1670 1671 uset_close(contractions); 1672 1673 return uset_size(unsafe); 1674 } 1675 1676 #endif // !UCONFIG_NO_COLLATION 1677