// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2015, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* rulebasedcollator.cpp
*
* (replaced the former tblcoll.cpp)
*
* created on: 2012feb14 with new and old collation code
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/coll.h"
#include "unicode/coleitr.h"
#include "unicode/localpointer.h"
#include "unicode/locid.h"
#include "unicode/sortkey.h"
#include "unicode/tblcoll.h"
#include "unicode/ucol.h"
#include "unicode/uiter.h"
#include "unicode/uloc.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/usetiter.h"
#include "unicode/utf8.h"
#include "unicode/uversion.h"
#include "bocsu.h"
#include "charstr.h"
#include "cmemory.h"
#include "collation.h"
#include "collationcompare.h"
#include "collationdata.h"
#include "collationdatareader.h"
#include "collationfastlatin.h"
#include "collationiterator.h"
#include "collationkeys.h"
#include "collationroot.h"
#include "collationsets.h"
#include "collationsettings.h"
#include "collationtailoring.h"
#include "cstring.h"
#include "uassert.h"
#include "ucol_imp.h"
#include "uhash.h"
#include "uitercollationiterator.h"
#include "ustr_imp.h"
#include "utf16collationiterator.h"
#include "utf8collationiterator.h"
#include "uvectr64.h"

U_NAMESPACE_BEGIN

namespace {

/**
 * Sort key sink for a fixed-size, caller-provided buffer.
 * It never grows: Resize() always returns FALSE, and
 * AppendBeyondCapacity() only fills whatever room is left in the buffer.
 */
class FixedSortKeyByteSink : public SortKeyByteSink {
public:
    FixedSortKeyByteSink(char *dest, int32_t destCapacity)
            : SortKeyByteSink(dest, destCapacity) {}
    virtual ~FixedSortKeyByteSink();

private:
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
    virtual UBool Resize(int32_t appendCapacity, int32_t length);
};

FixedSortKeyByteSink::~FixedSortKeyByteSink() {}

/**
 * Copies as many of the bytes as still fit after the first length bytes
 * of the buffer (capacity_ - length); the remaining bytes are not stored.
 */
void
FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) {
    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
    // Fill the buffer completely.
    int32_t available = capacity_ - length;
    if (available > 0) {
        uprv_memcpy(buffer_ + length, bytes, available);
    }
}

/** A fixed buffer cannot grow: always reports failure. */
UBool
FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) {
    return FALSE;
}

}  // namespace

/**
 * Sort key sink that writes into the byte buffer of a CollationKey
 * and enlarges that buffer through key_.reallocate() when it overflows.
 */
// Not in an anonymous namespace, so that it can be a friend of CollationKey.
class CollationKeyByteSink : public SortKeyByteSink {
public:
    CollationKeyByteSink(CollationKey &key)
            : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()),
              key_(key) {}
    virtual ~CollationKeyByteSink();

private:
    virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length);
    virtual UBool Resize(int32_t appendCapacity, int32_t length);

    CollationKey &key_;
};

CollationKeyByteSink::~CollationKeyByteSink() {}

/**
 * Grows the key buffer and then appends all n bytes after the first length bytes.
 * If growing fails, nothing is copied.
 */
void
CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) {
    // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_
    if (Resize(n, length)) {
        uprv_memcpy(buffer_ + length, bytes, n);
    }
}

/**
 * Reallocates the key buffer, keeping the first length bytes (passed on to reallocate()).
 * The new capacity is the largest of: twice the current capacity,
 * length + 2 * appendCapacity, and 200.
 * @return TRUE if the buffer was enlarged; FALSE if there is no buffer
 *         or the reallocation failed (in the latter case SetNotOk() is called)
 */
UBool
CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) {
    if (buffer_ == NULL) {
        return FALSE;  // allocation failed before already
    }
    int32_t newCapacity = 2 * capacity_;
    int32_t altCapacity = length + 2 * appendCapacity;
    if (newCapacity < altCapacity) {
        newCapacity = altCapacity;
    }
    if (newCapacity < 200) {
        newCapacity = 200;
    }
    uint8_t *newBuffer = key_.reallocate(newCapacity, length);
    if (newBuffer == NULL) {
        SetNotOk();
        return FALSE;
    }
    buffer_ = reinterpret_cast<char *>(newBuffer);
    capacity_ = newCapacity;
    return TRUE;
}

/**
 * Copy constructor: uses the same data, tailoring, settings and cache entry as other
 * and takes an additional reference on the settings and on the cache entry.
 */
RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other)
        : Collator(other),
          data(other.data),
          settings(other.settings),
          tailoring(other.tailoring),
          cacheEntry(other.cacheEntry),
          validLocale(other.validLocale),
          explicitlySetAttributes(other.explicitlySetAttributes),
          actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) {
    // NOTE(review): settings and cacheEntry are dereferenced unconditionally;
    // presumably other was constructed successfully -- confirm for failed collators.
    settings->addRef();
    cacheEntry->addRef();
}

/**
 * Builds a collator from binary tailoring data that is read on top of base.
 * Sets U_ILLEGAL_ARGUMENT_ERROR if bin is NULL, length is 0 or base is NULL,
 * U_UNSUPPORTED_ERROR if base does not use the root tailoring,
 * and U_MEMORY_ALLOCATION_ERROR if the new tailoring cannot be created.
 * On any failure all members keep their initial NULL/empty values.
 */
RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
                                     const RuleBasedCollator *base, UErrorCode &errorCode)
        : data(NULL),
          settings(NULL),
          tailoring(NULL),
          cacheEntry(NULL),
          validLocale(""),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(FALSE) {
    if(U_FAILURE(errorCode)) { return; }
    if(bin == NULL || length == 0 || base == NULL) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    const CollationTailoring *root = CollationRoot::getRoot(errorCode);
    if(U_FAILURE(errorCode)) { return; }
    if(base->tailoring != root) {
        errorCode = U_UNSUPPORTED_ERROR;
        return;
    }
    LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings));
    if(t.isNull() || t->isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    CollationDataReader::read(base->tailoring, bin, length, *t, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    // The binary data carries no locale: mark the actual locale as bogus.
    t->actualLocale.setToBogus();
    // Ownership of the tailoring passes from the LocalPointer to adoptTailoring().
    adoptTailoring(t.orphan(), errorCode);
}

/**
 * Creates a collator for a cache entry, using the entry's tailoring (data and settings)
 * and valid locale. Takes a reference on the settings and on the entry.
 * entry is dereferenced in the initializer list and must not be NULL.
 */
RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry)
        : data(entry->tailoring->data),
          settings(entry->tailoring->settings),
          tailoring(entry->tailoring),
          cacheEntry(entry),
          validLocale(entry->validLocale),
          explicitlySetAttributes(0),
          actualLocaleIsSameAsValid(FALSE) {
    settings->addRef();
    cacheEntry->addRef();
}

/** Releases this collator's references to the settings and the cache entry. */
RuleBasedCollator::~RuleBasedCollator() {
    SharedObject::clearPtr(settings);
    SharedObject::clearPtr(cacheEntry);
}

/**
 * Takes over tailoring t: wraps it in a new CollationCacheEntry and initializes
 * data, settings, tailoring, cacheEntry and validLocale from it.
 * If errorCode already indicates a failure, or if the cache entry cannot be allocated,
 * t->deleteIfZeroRefCount() is called and the other members are not set.
 * Must only be called on a collator whose members are still NULL (see the assertion).
 */
void
RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) {
        t->deleteIfZeroRefCount();
        return;
    }
    U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL);
    cacheEntry = new CollationCacheEntry(t->actualLocale, t);
    if(cacheEntry == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        t->deleteIfZeroRefCount();
        return;
    }
    data = t->data;
    settings = t->settings;
    settings->addRef();
    tailoring = t;
    cacheEntry->addRef();
    validLocale = t->actualLocale;
    actualLocaleIsSameAsValid = FALSE;
}

/** Returns a new heap-allocated copy of this collator (NULL if the allocation fails). */
Collator *
RuleBasedCollator::clone() const {
    return new RuleBasedCollator(*this);
}

/**
 * Assignment: switches to the settings, tailoring and cache entry of other.
 * SharedObject::copyPtr() is used for the two reference-counted members.
 */
RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) {
    if(this == &other) { return *this; }
    SharedObject::copyPtr(other.settings, settings);
    tailoring = other.tailoring;
    SharedObject::copyPtr(other.cacheEntry, cacheEntry);
    data = tailoring->data;
    validLocale = other.validLocale;
    explicitlySetAttributes = other.explicitlySetAttributes;
    actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid;
    return *this;
}

UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator)

/**
 * Equality test. Collators are equal if the base class considers them equal,
 * their settings are equal, and either they use the same data object,
 * or their rule strings match, or (as an approximation, see below)
 * their sets of tailored characters and strings are equal.
 */
UBool
RuleBasedCollator::operator==(const Collator& other) const {
    if(this == &other) { return TRUE; }
    if(!Collator::operator==(other)) { return FALSE; }
    // NOTE(review): the cast relies on Collator::operator==() having established
    // that other is a RuleBasedCollator -- confirm in the base class.
    const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other);
    if(*settings != *o.settings) { return FALSE; }
    if(data == o.data) { return TRUE; }
    // A collator whose data has no base is the root collator.
    UBool thisIsRoot = data->base == NULL;
    UBool otherIsRoot = o.data->base == NULL;
    U_ASSERT(!thisIsRoot || !otherIsRoot);  // otherwise their data pointers should be ==
    if(thisIsRoot != otherIsRoot) { return FALSE; }
    if((thisIsRoot || !tailoring->rules.isEmpty()) &&
            (otherIsRoot || !o.tailoring->rules.isEmpty())) {
        // Shortcut: If both collators have valid rule strings, then compare those.
        if(tailoring->rules == o.tailoring->rules) { return TRUE; }
    }
    // Different rule strings can result in the same or equivalent tailoring.
    // The rule strings are optional in ICU resource bundles, although included by default.
    // cloneBinary() drops the rule string.
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode));
    LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode));
    if(U_FAILURE(errorCode)) { return FALSE; }
    if(*thisTailored != *otherTailored) { return FALSE; }
    // For completeness, we should compare all of the mappings;
    // or we should create a list of strings, sort it with one collator,
    // and check if both collators compare adjacent strings the same
    // (order & strength, down to quaternary); or similar.
    // Testing equality of collators seems unusual.
    return TRUE;
}

/**
 * Hash code: the settings' hash code, and for a tailored (non-root) collator
 * additionally XORed with the CE32 of each tailored code point.
 * Returns 0 if the tailored set cannot be built.
 */
int32_t
RuleBasedCollator::hashCode() const {
    int32_t h = settings->hashCode();
    if(data->base == NULL) { return h; }  // root collator
    // Do not rely on the rule string, see comments in operator==().
    UErrorCode errorCode = U_ZERO_ERROR;
    LocalPointer<UnicodeSet> set(getTailoredSet(errorCode));
    if(U_FAILURE(errorCode)) { return 0; }
    UnicodeSetIterator iter(*set);
    // Stops at the first string element: only single code points are hashed
    // (presumably the iterator returns strings after all code points -- confirm).
    while(iter.next() && !iter.isString()) {
        h ^= data->getCE32(iter.getCodepoint());
    }
    return h;
}

/**
 * Records the valid locale, and whether the actual locale to report is
 * the valid locale rather than the tailoring's own actualLocale.
 * The requested locale is ignored.
 */
void
RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid,
                              const Locale &actual) {
    if(actual == tailoring->actualLocale) {
        actualLocaleIsSameAsValid = FALSE;
    } else {
        U_ASSERT(actual == valid);
        actualLocaleIsSameAsValid = TRUE;
    }
    // Do not modify tailoring.actualLocale:
    // We cannot be sure that that would be thread-safe.
    validLocale = valid;
    (void)requested;  // Ignore, see also ticket #10477.
}

/**
 * Returns the actual or valid locale.
 * For ULOC_REQUESTED_LOCALE or any other type, sets U_ILLEGAL_ARGUMENT_ERROR
 * and returns the root locale; the root locale is also returned
 * if errorCode already indicates a failure.
 */
Locale
RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const {
    if(U_FAILURE(errorCode)) {
        return Locale::getRoot();
    }
    switch(type) {
    case ULOC_ACTUAL_LOCALE:
        return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale;
    case ULOC_VALID_LOCALE:
        return validLocale;
    case ULOC_REQUESTED_LOCALE:
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return Locale::getRoot();
    }
}

/**
 * Like getLocale() but returns the locale ID string.
 * @return NULL on failure or if the selected locale is bogus;
 *         "root" if the locale name is empty; otherwise the locale name,
 *         which points into the Locale object held by this collator or its tailoring
 */
const char *
RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return NULL;
    }
    const Locale *result;
    switch(type) {
    case ULOC_ACTUAL_LOCALE:
        result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale;
        break;
    case ULOC_VALID_LOCALE:
        result = &validLocale;
        break;
    case ULOC_REQUESTED_LOCALE:
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return NULL;
    }
    if(result->isBogus()) { return NULL; }
    const char *id = result->getName();
    return id[0] == 0 ? "root" : id;
}

/** Returns the tailoring's rule string (a reference to the string stored in the tailoring). */
const UnicodeString&
RuleBasedCollator::getRules() const {
    return tailoring->rules;
}

/**
 * Writes the rules into buffer: only the tailoring rules for UCOL_TAILORING_ONLY,
 * otherwise the root rules followed by the tailoring rules.
 */
void
RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const {
    if(delta == UCOL_TAILORING_ONLY) {
        buffer = tailoring->rules;
        return;
    }
    // UCOL_FULL_RULES
    buffer.remove();
    CollationLoader::appendRootRules(buffer);
    // The result of getTerminatedBuffer() is discarded;
    // presumably it is called to NUL-terminate the buffer for C callers -- confirm.
    buffer.append(tailoring->rules).getTerminatedBuffer();
}

/**
 * Copies the tailoring's version into version and adds a value
 * derived from UCOL_RUNTIME_VERSION to its first byte.
 */
void
RuleBasedCollator::getVersion(UVersionInfo version) const {
    uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH);
    version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4);
}

/**
 * Returns a newly allocated set of the tailored characters and strings;
 * the set is empty for the root collator (data->base == NULL).
 * The caller owns the returned set.
 * @return the new set, or NULL on failure (errorCode is set)
 */
UnicodeSet *
RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return NULL; }
    UnicodeSet *tailored = new UnicodeSet();
    if(tailored == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    if(data->base != NULL) {
        TailoredSet(tailored).forData(data, errorCode);
        if(U_FAILURE(errorCode)) {
            delete tailored;
            return NULL;
        }
    }
    return tailored;
}

/**
 * Clears the given sets (each may be NULL) and fills them via
 * ContractionsAndExpansions::forData() for this collator's data.
 */
void
RuleBasedCollator::internalGetContractionsAndExpansions(
        UnicodeSet *contractions, UnicodeSet *expansions,
        UBool addPrefixes, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    if(contractions != NULL) {
        contractions->clear();
    }
    if(expansions != NULL) {
        expansions->clear();
    }
    ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode);
}

/**
 * Adds the contractions for code point c to set
 * (via ContractionsAndExpansions::forCodePoint(), without prefixes).
 * Unlike internalGetContractionsAndExpansions(), the set is not cleared first.
 */
void
RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode);
}

/** Returns the settings that belong to the tailoring, as opposed to this collator's current settings. */
const CollationSettings &
RuleBasedCollator::getDefaultSettings() const {
    return *tailoring->settings;
}

/**
 * Returns the current value of an attribute from the settings.
 * On/off attributes are reported as UCOL_ON/UCOL_OFF by testing their option bit.
 * UCOL_HIRAGANA_QUATERNARY_MODE always reports UCOL_OFF.
 * An unknown attribute sets U_ILLEGAL_ARGUMENT_ERROR and returns UCOL_DEFAULT;
 * UCOL_DEFAULT is also returned if errorCode already indicates a failure.
 */
UColAttributeValue
RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; }
    int32_t option;
    switch(attr) {
    case UCOL_FRENCH_COLLATION:
        option = CollationSettings::BACKWARD_SECONDARY;
        break;
    case UCOL_ALTERNATE_HANDLING:
        return settings->getAlternateHandling();
    case UCOL_CASE_FIRST:
        return settings->getCaseFirst();
    case UCOL_CASE_LEVEL:
        option = CollationSettings::CASE_LEVEL;
        break;
    case UCOL_NORMALIZATION_MODE:
        option = CollationSettings::CHECK_FCD;
        break;
    case UCOL_STRENGTH:
        return (UColAttributeValue)settings->getStrength();
    case UCOL_HIRAGANA_QUATERNARY_MODE:
        // Deprecated attribute, unsettable.
        return UCOL_OFF;
    case UCOL_NUMERIC_COLLATION:
        option = CollationSettings::NUMERIC;
        break;
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_DEFAULT;
    }
    return ((settings->options & option) == 0) ? UCOL_OFF : UCOL_ON;
}

/**
 * Sets an attribute to value, or back to the tailoring's default for UCOL_DEFAULT.
 * The settings object is obtained via SharedObject::copyOnWrite() before it is modified.
 * Also records whether the attribute is now explicitly set or at its default.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for an unknown attribute (and whatever errors
 * the CollationSettings setters report for an invalid value).
 */
void
RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value,
                                UErrorCode &errorCode) {
    // getAttribute() also checks the incoming errorCode and rejects unknown attributes.
    UColAttributeValue oldValue = getAttribute(attr, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    if(value == oldValue) {
        // No change in value; only note that the caller set it explicitly.
        setAttributeExplicitly(attr);
        return;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(settings == &defaultSettings) {
        // Still using the tailoring's own settings object: resetting to default changes nothing.
        if(value == UCOL_DEFAULT) {
            setAttributeDefault(attr);
            return;
        }
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    switch(attr) {
    case UCOL_FRENCH_COLLATION:
        ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_ALTERNATE_HANDLING:
        ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode);
        break;
    case UCOL_CASE_FIRST:
        ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode);
        break;
    case UCOL_CASE_LEVEL:
        ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_NORMALIZATION_MODE:
        ownedSettings->setFlag(CollationSettings::CHECK_FCD, value,
                               defaultSettings.options, errorCode);
        break;
    case UCOL_STRENGTH:
        ownedSettings->setStrength(value, defaultSettings.options, errorCode);
        break;
    case UCOL_HIRAGANA_QUATERNARY_MODE:
        // Deprecated attribute. Check for valid values but do not change anything.
        if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        }
        break;
    case UCOL_NUMERIC_COLLATION:
        ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode);
        break;
    default:
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        break;
    }
    if(U_FAILURE(errorCode)) { return; }
    // The fast Latin options are derived from the settings and must be recomputed.
    setFastLatinOptions(*ownedSettings);
    if(value == UCOL_DEFAULT) {
        setAttributeDefault(attr);
    } else {
        setAttributeExplicitly(attr);
    }
}

/**
 * Sets the maximum variable group, and with it the variable top
 * (the last primary of that group).
 * group must be UCOL_REORDER_CODE_DEFAULT or in the range
 * UCOL_REORDER_CODE_FIRST..UCOL_REORDER_CODE_CURRENCY,
 * otherwise U_ILLEGAL_ARGUMENT_ERROR is set.
 * @return *this
 */
Collator &
RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return *this; }
    // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1.
    int32_t value;
    if(group == UCOL_REORDER_CODE_DEFAULT) {
        value = UCOL_DEFAULT;
    } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) {
        value = group - UCOL_REORDER_CODE_FIRST;
    } else {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }
    CollationSettings::MaxVariable oldValue = settings->getMaxVariable();
    if(value == oldValue) {
        // No change in value; only note that the caller set it explicitly.
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
        return *this;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(settings == &defaultSettings) {
        // Still using the tailoring's own settings object: resetting to default changes nothing.
        if(value == UCOL_DEFAULT) {
            setAttributeDefault(ATTR_VARIABLE_TOP);
            return *this;
        }
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return *this;
    }

    if(group == UCOL_REORDER_CODE_DEFAULT) {
        // Resolve "default" to the concrete group from the default settings.
        group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable());
    }
    uint32_t varTop = data->getLastPrimaryForGroup(group);
    U_ASSERT(varTop != 0);
    ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode);
    if(U_FAILURE(errorCode)) { return *this; }
    ownedSettings->variableTop = varTop;
    setFastLatinOptions(*ownedSettings);
    if(value == UCOL_DEFAULT) {
        setAttributeDefault(ATTR_VARIABLE_TOP);
    } else {
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
    }
    return *this;
}

/** Returns the current maximum variable group as a reorder code. */
UColReorderCode
RuleBasedCollator::getMaxVariable() const {
    return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable());
}

/** Returns the variable top from the current settings; the error code is not used. */
uint32_t
RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const {
    return settings->variableTop;
}

/**
 * Sets the variable top from a string which must map to exactly one collation element.
 * len < 0 means that varTop is NUL-terminated.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a NULL or empty string, and
 * U_CE_NOT_FOUND_ERROR if the string yields no CE or more than one CE.
 * @return the resulting variable top from the settings, or 0 on the errors above
 */
uint32_t
RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return 0; }
    if(varTop == NULL && len !=0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(len < 0) { len = u_strlen(varTop); }
    if(len == 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    UBool numeric = settings->isNumeric();
    int64_t ce1, ce2;
    // Fetch the first two CEs, with or without FCD checking according to the settings.
    if(settings->dontCheckFCD()) {
        UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
        ce1 = ci.nextCE(errorCode);
        ce2 = ci.nextCE(errorCode);
    } else {
        FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len);
        ce1 = ci.nextCE(errorCode);
        ce2 = ci.nextCE(errorCode);
    }
    if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) {
        errorCode = U_CE_NOT_FOUND_ERROR;
        return 0;
    }
    // The upper 32 bits of the single CE are passed on as the variable-top primary.
    setVariableTop((uint32_t)(ce1 >> 32), errorCode);
    return settings->variableTop;
}

/** UnicodeString convenience overload of setVariableTop(const UChar *, int32_t, UErrorCode &). */
uint32_t
RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) {
    return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode);
}

/**
 * Sets the variable top from a primary weight.
 * The value is pinned to the last primary of the reordering group that contains it;
 * sets U_ILLEGAL_ARGUMENT_ERROR if that group is outside
 * UCOL_REORDER_CODE_FIRST..UCOL_REORDER_CODE_CURRENCY.
 * Also records whether the variable top is now explicitly set or at its default.
 */
void
RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(varTop != settings->variableTop) {
        // Pin the variable top to the end of the reordering group which contains it.
        // Only a few special groups are supported.
        int32_t group = data->getGroupForPrimary(varTop);
        if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) {
            errorCode = U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        uint32_t v = data->getLastPrimaryForGroup(group);
        U_ASSERT(v != 0 && v >= varTop);
        varTop = v;
        // Check again: after pinning, the value may equal the current one.
        if(varTop != settings->variableTop) {
            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
            if(ownedSettings == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST,
                                          getDefaultSettings().options, errorCode);
            if(U_FAILURE(errorCode)) { return; }
            ownedSettings->variableTop = varTop;
            setFastLatinOptions(*ownedSettings);
        }
    }
    if(varTop == getDefaultSettings().variableTop) {
        setAttributeDefault(ATTR_VARIABLE_TOP);
    } else {
        setAttributeExplicitly(ATTR_VARIABLE_TOP);
    }
}

/**
 * Copies the current reorder codes into dest.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a negative capacity or a NULL dest with capacity > 0,
 * and U_BUFFER_OVERFLOW_ERROR if there are more codes than capacity (nothing is copied then).
 * @return the number of reorder codes, also in the overflow case
 */
int32_t
RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity,
                                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(capacity < 0 || (dest == NULL && capacity > 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    int32_t length = settings->reorderCodesLength;
    if(length == 0) { return 0; }
    if(length > capacity) {
        errorCode = U_BUFFER_OVERFLOW_ERROR;
        return length;
    }
    // length * 4: byte count for length int32_t values.
    uprv_memcpy(dest, settings->reorderCodes, length * 4);
    return length;
}

/**
 * Sets the reorder codes.
 * A single UCOL_REORDER_CODE_NONE is treated like an empty list;
 * a single UCOL_REORDER_CODE_DEFAULT restores the reordering of the default settings.
 * Nothing is changed if the codes are the same as the current ones.
 * Sets U_ILLEGAL_ARGUMENT_ERROR for a negative length or a NULL array with length > 0.
 */
void
RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length,
                                   UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(length < 0 || (reorderCodes == NULL && length > 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }
    if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) {
        length = 0;
    }
    // length * 4: byte count for length int32_t values.
    if(length == settings->reorderCodesLength &&
            uprv_memcmp(reorderCodes, settings->reorderCodes, length * 4) == 0) {
        return;
    }
    const CollationSettings &defaultSettings = getDefaultSettings();
    if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) {
        // Nothing to do if this collator still uses the tailoring's own settings object.
        if(settings != &defaultSettings) {
            CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
            if(ownedSettings == NULL) {
                errorCode = U_MEMORY_ALLOCATION_ERROR;
                return;
            }
            ownedSettings->copyReorderingFrom(defaultSettings, errorCode);
            setFastLatinOptions(*ownedSettings);
        }
        return;
    }
    CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings);
    if(ownedSettings == NULL) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
        return;
    }
    ownedSettings->setReordering(*data, reorderCodes, length, errorCode);
    setFastLatinOptions(*ownedSettings);
}

/**
 * Recomputes ownedSettings.fastLatinOptions (and fills fastLatinPrimaries)
 * via CollationFastLatin::getOptions() for this collator's data.
 * Called after each modification of a settings object in this file.
 */
void
RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const {
    ownedSettings.fastLatinOptions = CollationFastLatin::getOptions(
            data, ownedSettings,
            ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries));
}

/** Compares two UnicodeStrings; returns UCOL_EQUAL if errorCode already indicates a failure. */
UCollationResult
RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    return doCompare(left.getBuffer(), left.length(),
                     right.getBuffer(), right.length(), errorCode);
}

/**
 * Compares at most the first length code units of each string.
 * Returns UCOL_EQUAL for length == 0 or on a prior failure;
 * a negative length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right,
                           int32_t length, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; }
    if(length < 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    int32_t leftLength = left.length();
    int32_t rightLength = right.length();
    if(leftLength > length) { leftLength = length; }
    if(rightLength > length) { rightLength = length; }
    return doCompare(left.getBuffer(), leftLength,
                     right.getBuffer(), rightLength, errorCode);
}

/**
 * Compares two UTF-16 strings. A negative length means NUL-terminated.
 * A NULL pointer with a non-zero length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compare(const UChar *left, int32_t leftLength,
                           const UChar *right, int32_t rightLength,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    // Make sure both or neither strings have a known length.
    // We do not optimize for mixed length/termination.
    if(leftLength >= 0) {
        if(rightLength < 0) { rightLength = u_strlen(right); }
    } else {
        if(rightLength >= 0) { leftLength = u_strlen(left); }
    }
    return doCompare(left, leftLength, right, rightLength, errorCode);
}

/**
 * Compares two UTF-8 strings given as StringPieces.
 * A NULL data pointer in a non-empty piece sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right,
                               UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data());
    const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data());
    if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode);
}

/**
 * Compares two UTF-8 strings given as char pointers.
 * A negative length means NUL-terminated.
 * A NULL pointer with a non-zero length sets U_ILLEGAL_ARGUMENT_ERROR.
 */
UCollationResult
RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength,
                                       const char *right, int32_t rightLength,
                                       UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return UCOL_EQUAL; }
    if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return UCOL_EQUAL;
    }
    // Make sure both or neither strings have a known length.
    // We do not optimize for mixed length/termination.
    if(leftLength >= 0) {
        if(rightLength < 0) { rightLength = uprv_strlen(right); }
    } else {
        if(rightLength >= 0) { leftLength = uprv_strlen(left); }
    }
    return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength,
                     reinterpret_cast<const uint8_t *>(right), rightLength, errorCode);
}

namespace {

/**
 * Abstract iterator for identical-level string comparisons.
 * Returns FCD code points and handles temporary switching to NFD.
 */
class NFDIterator : public UObject {
public:
    // index < 0 means that no decomposition is being iterated.
    // decomp is assigned in nextDecomposedCodePoint() before index becomes >= 0,
    // and is only read while index >= 0.
    NFDIterator() : index(-1), length(0) {}
    virtual ~NFDIterator() {}
    /**
     * Returns the next code point from the internal normalization buffer,
     * or else the next text code point.
     * Returns -1 at the end of the text.
     */
    UChar32 nextCodePoint() {
        if(index >= 0) {
            if(index == length) {
                // The decomposition is used up: go back to reading from the text.
                index = -1;
            } else {
                UChar32 c;
                U16_NEXT_UNSAFE(decomp, index, c);
                return c;
            }
        }
        return nextRawCodePoint();
    }
    /**
     * @param nfcImpl
     * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint()
     * @return the first code point in c's decomposition,
     *         or c itself if it was decomposed already or if it does not decompose
     */
    UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) {
        if(index >= 0) { return c; }
        decomp = nfcImpl.getDecomposition(c, buffer, length);
        if(decomp == NULL) { return c; }
        index = 0;
        U16_NEXT_UNSAFE(decomp, index, c);
        return c;
    }
protected:
    /**
     * Returns the next text code point in FCD order.
     * Returns -1 at the end of the text.
     */
    virtual UChar32 nextRawCodePoint() = 0;
private:
    const UChar *decomp;
    UChar buffer[4];
    int32_t index;
    int32_t length;
};

/**
 * Iterates over the code points of a UTF-16 string as is.
 * textLimit == NULL means that the text is NUL-terminated.
 */
class UTF16NFDIterator : public NFDIterator {
public:
    UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        if(s == limit) { return U_SENTINEL; }
        UChar32 c = *s++;
        if(limit == NULL && c == 0) {
            // Reached the terminating NUL: make s == limit so that further calls end immediately.
            s = NULL;
            return U_SENTINEL;
        }
        UChar trail;
        if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) {
            ++s;
            c = U16_GET_SUPPLEMENTARY(c, trail);
        }
        return c;
    }

    const UChar *s;
    const UChar *limit;
};

/**
 * UTF-16 iterator over the FCD form of the text.
 * If makeFCD() finds the whole text to be in FCD form, the text is iterated in place;
 * otherwise an FCD copy is built in str and iterated instead.
 * If an error occurs, s and limit remain NULL and the iterator yields no code points.
 */
class FCDUTF16NFDIterator : public UTF16NFDIterator {
public:
    FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit)
            : UTF16NFDIterator(NULL, NULL) {
        UErrorCode errorCode = U_ZERO_ERROR;
        const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode);
        if(U_FAILURE(errorCode)) { return; }
        if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) {
            s = text;
            limit = spanLimit;
        } else {
            // Start with the prefix that needs no change and append the FCD form of the rest.
            str.setTo(text, (int32_t)(spanLimit - text));
            {
                ReorderingBuffer buffer(nfcImpl, str);
                if(buffer.init(str.length(), errorCode)) {
                    nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode);
                }
            }
            if(U_SUCCESS(errorCode)) {
                s = str.getBuffer();
                limit = s + str.length();
            }
        }
    }
private:
    UnicodeString str;
};

/**
 * Iterates over the code points of a UTF-8 string as is.
 * textLength < 0 means that the text is NUL-terminated.
 */
class UTF8NFDIterator : public NFDIterator {
public:
    UTF8NFDIterator(const uint8_t *text, int32_t textLength)
        : s(text), pos(0), length(textLength) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; }
        UChar32 c;
        U8_NEXT_OR_FFFD(s, pos, length, c);
        return c;
    }

    const uint8_t *s;
    int32_t pos;
    int32_t length;
};

/** UTF-8 iterator that gets its code points from an FCDUTF8CollationIterator. */
class FCDUTF8NFDIterator : public NFDIterator {
public:
    FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength)
            : u8ci(data, FALSE, text, 0, textLength) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        // The error code is local to this call and is not reported to the caller.
        UErrorCode errorCode = U_ZERO_ERROR;
        return u8ci.nextCodePoint(errorCode);
    }
private:
    FCDUTF8CollationIterator u8ci;
};

/** Iterates over the code points of a UCharIterator as is, via uiter_next32(). */
class UIterNFDIterator : public NFDIterator {
public:
    UIterNFDIterator(UCharIterator &it) : iter(it) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        return uiter_next32(&iter);
    }
private:
    UCharIterator &iter;
};

/** UCharIterator-based iterator that gets its code points from an FCDUIterCollationIterator. */
class FCDUIterNFDIterator : public NFDIterator {
public:
    FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex)
            : uici(data, FALSE, it, startIndex) {}
protected:
    virtual UChar32 nextRawCodePoint() {
        // The error code is local to this call and is not reported to the caller.
        UErrorCode errorCode = U_ZERO_ERROR;
        return uici.nextCodePoint(errorCode);
    }
private:
    FCDUIterCollationIterator uici;
};

/**
 * Identical-level comparison of two strings, code point by code point.
 * Code points that differ are decomposed before they are compared.
 * The end of a string sorts below U+FFFE, which sorts below every other code point.
 */
UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl,
                                NFDIterator &left, NFDIterator &right) {
    for(;;) {
        // Fetch the next FCD code point from each string.
        UChar32 leftCp = left.nextCodePoint();
        UChar32 rightCp = right.nextCodePoint();
        if(leftCp == rightCp) {
            if(leftCp < 0) { break; }
            continue;
        }
        // If they are different, then decompose each and compare again.
        if(leftCp < 0) {
            leftCp = -2;  // end of string
        } else if(leftCp == 0xfffe) {
            leftCp = -1;  // U+FFFE: merge separator
        } else {
            leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
        }
        if(rightCp < 0) {
            rightCp = -2;  // end of string
        } else if(rightCp == 0xfffe) {
            rightCp = -1;  // U+FFFE: merge separator
        } else {
            rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
        }
        // If the decomposed code points are equal, continue with the following ones.
        if(leftCp < rightCp) { return UCOL_LESS; }
        if(leftCp > rightCp) { return UCOL_GREATER; }
    }
    return UCOL_EQUAL;
}

}  // namespace

/**
 * UTF-16 comparison worker.
 * Either both lengths are >= 0, or both strings are NUL-terminated (leftLength < 0).
 * Steps: skip the identical prefix (backing up to a safe boundary),
 * try the fast Latin comparison, fall back to the full comparison up to the
 * quaternary level, and finally compare the identical level if the strength asks for it.
 */
UCollationResult
RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
                             const UChar *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    const UChar *leftLimit;
    const UChar *rightLimit;
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        leftLimit = NULL;
        rightLimit = NULL;
        UChar c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        leftLimit = left + leftLength;
        rightLimit = right + rightLength;
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        if((equalPrefixLength != leftLength &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
                (equalPrefixLength != rightLength &&
                    data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            while(--equalPrefixLength > 0 &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
        }
        // Notes:
        // - A longer string can compare equal to a prefix of it if only ignorables follow.
        // - With a backward level, a longer string can compare less-than a prefix of it.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // Try the fast Latin path only if it is enabled (options >= 0) and the first
    // differing code unit of each string, if any, is at most LATIN_MAX.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength,
                                                      leftLength - equalPrefixLength,
                                                      right + equalPrefixLength,
                                                      rightLength - equalPrefixLength);
        } else {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength, -1,
                                                      right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        // Full comparison, with or without FCD checking according to the settings.
        if(settings->dontCheckFCD()) {
            UTF16CollationIterator leftIter(data, numeric,
                                            left, left + equalPrefixLength, leftLimit);
            UTF16CollationIterator rightIter(data, numeric,
                                             right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF16CollationIterator leftIter(data, numeric,
                                               left, left + equalPrefixLength, leftLimit);
            FCDUTF16CollationIterator rightIter(data, numeric,
                                                right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    if(settings->dontCheckFCD()) {
        UTF16NFDIterator leftIter(left, leftLimit);
        UTF16NFDIterator rightIter(right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
        FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}

UCollationResult
RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
                             const uint8_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
1096 int32_t equalPrefixLength = 0; 1097 if(leftLength < 0) { 1098 uint8_t c; 1099 while((c = left[equalPrefixLength]) == right[equalPrefixLength]) { 1100 if(c == 0) { return UCOL_EQUAL; } 1101 ++equalPrefixLength; 1102 } 1103 } else { 1104 for(;;) { 1105 if(equalPrefixLength == leftLength) { 1106 if(equalPrefixLength == rightLength) { return UCOL_EQUAL; } 1107 break; 1108 } else if(equalPrefixLength == rightLength || 1109 left[equalPrefixLength] != right[equalPrefixLength]) { 1110 break; 1111 } 1112 ++equalPrefixLength; 1113 } 1114 } 1115 // Back up to the start of a partially-equal code point. 1116 if(equalPrefixLength > 0 && 1117 ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) || 1118 (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) { 1119 while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {} 1120 } 1121 1122 UBool numeric = settings->isNumeric(); 1123 if(equalPrefixLength > 0) { 1124 UBool unsafe = FALSE; 1125 if(equalPrefixLength != leftLength) { 1126 int32_t i = equalPrefixLength; 1127 UChar32 c; 1128 U8_NEXT_OR_FFFD(left, i, leftLength, c); 1129 unsafe = data->isUnsafeBackward(c, numeric); 1130 } 1131 if(!unsafe && equalPrefixLength != rightLength) { 1132 int32_t i = equalPrefixLength; 1133 UChar32 c; 1134 U8_NEXT_OR_FFFD(right, i, rightLength, c); 1135 unsafe = data->isUnsafeBackward(c, numeric); 1136 } 1137 if(unsafe) { 1138 // Identical prefix: Back up to the start of a contraction or reordering sequence. 1139 UChar32 c; 1140 do { 1141 U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c); 1142 } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric)); 1143 } 1144 // See the notes in the UTF-16 version. 1145 1146 // Pass the actual start of each string into the CollationIterators, 1147 // plus the equalPrefixLength position, 1148 // so that prefix matches back into the equal prefix work. 
1149 } 1150 1151 int32_t result; 1152 int32_t fastLatinOptions = settings->fastLatinOptions; 1153 if(fastLatinOptions >= 0 && 1154 (equalPrefixLength == leftLength || 1155 left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) && 1156 (equalPrefixLength == rightLength || 1157 right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) { 1158 if(leftLength >= 0) { 1159 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1160 settings->fastLatinPrimaries, 1161 fastLatinOptions, 1162 left + equalPrefixLength, 1163 leftLength - equalPrefixLength, 1164 right + equalPrefixLength, 1165 rightLength - equalPrefixLength); 1166 } else { 1167 result = CollationFastLatin::compareUTF8(data->fastLatinTable, 1168 settings->fastLatinPrimaries, 1169 fastLatinOptions, 1170 left + equalPrefixLength, -1, 1171 right + equalPrefixLength, -1); 1172 } 1173 } else { 1174 result = CollationFastLatin::BAIL_OUT_RESULT; 1175 } 1176 1177 if(result == CollationFastLatin::BAIL_OUT_RESULT) { 1178 if(settings->dontCheckFCD()) { 1179 UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1180 UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1181 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1182 } else { 1183 FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength); 1184 FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength); 1185 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1186 } 1187 } 1188 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1189 return (UCollationResult)result; 1190 } 1191 1192 // Note: If NUL-terminated, we could get the actual limits from the iterators now. 
1193 // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience, 1194 // and the benefit seems unlikely to be measurable. 1195 1196 // Compare identical level. 1197 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1198 left += equalPrefixLength; 1199 right += equalPrefixLength; 1200 if(leftLength > 0) { 1201 leftLength -= equalPrefixLength; 1202 rightLength -= equalPrefixLength; 1203 } 1204 if(settings->dontCheckFCD()) { 1205 UTF8NFDIterator leftIter(left, leftLength); 1206 UTF8NFDIterator rightIter(right, rightLength); 1207 return compareNFDIter(nfcImpl, leftIter, rightIter); 1208 } else { 1209 FCDUTF8NFDIterator leftIter(data, left, leftLength); 1210 FCDUTF8NFDIterator rightIter(data, right, rightLength); 1211 return compareNFDIter(nfcImpl, leftIter, rightIter); 1212 } 1213 } 1214 1215 UCollationResult 1216 RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right, 1217 UErrorCode &errorCode) const { 1218 if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; } 1219 UBool numeric = settings->isNumeric(); 1220 1221 // Identical-prefix test. 1222 int32_t equalPrefixLength = 0; 1223 { 1224 UChar32 leftUnit; 1225 UChar32 rightUnit; 1226 while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) { 1227 if(leftUnit < 0) { return UCOL_EQUAL; } 1228 ++equalPrefixLength; 1229 } 1230 1231 // Back out the code units that differed, for the real collation comparison. 1232 if(leftUnit >= 0) { left.previous(&left); } 1233 if(rightUnit >= 0) { right.previous(&right); } 1234 1235 if(equalPrefixLength > 0) { 1236 if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) || 1237 (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) { 1238 // Identical prefix: Back up to the start of a contraction or reordering sequence. 
1239 do { 1240 --equalPrefixLength; 1241 leftUnit = left.previous(&left); 1242 right.previous(&right); 1243 } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric)); 1244 } 1245 // See the notes in the UTF-16 version. 1246 } 1247 } 1248 1249 UCollationResult result; 1250 if(settings->dontCheckFCD()) { 1251 UIterCollationIterator leftIter(data, numeric, left); 1252 UIterCollationIterator rightIter(data, numeric, right); 1253 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1254 } else { 1255 FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength); 1256 FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength); 1257 result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode); 1258 } 1259 if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) { 1260 return result; 1261 } 1262 1263 // Compare identical level. 1264 left.move(&left, equalPrefixLength, UITER_ZERO); 1265 right.move(&right, equalPrefixLength, UITER_ZERO); 1266 const Normalizer2Impl &nfcImpl = data->nfcImpl; 1267 if(settings->dontCheckFCD()) { 1268 UIterNFDIterator leftIter(left); 1269 UIterNFDIterator rightIter(right); 1270 return compareNFDIter(nfcImpl, leftIter, rightIter); 1271 } else { 1272 FCDUIterNFDIterator leftIter(data, left, equalPrefixLength); 1273 FCDUIterNFDIterator rightIter(data, right, equalPrefixLength); 1274 return compareNFDIter(nfcImpl, leftIter, rightIter); 1275 } 1276 } 1277 1278 CollationKey & 1279 RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key, 1280 UErrorCode &errorCode) const { 1281 return getCollationKey(s.getBuffer(), s.length(), key, errorCode); 1282 } 1283 1284 CollationKey & 1285 RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key, 1286 UErrorCode &errorCode) const { 1287 if(U_FAILURE(errorCode)) { 1288 return key.setToBogus(); 1289 } 1290 if(s == 
NULL && length != 0) { 1291 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1292 return key.setToBogus(); 1293 } 1294 key.reset(); // resets the "bogus" state 1295 CollationKeyByteSink sink(key); 1296 writeSortKey(s, length, sink, errorCode); 1297 if(U_FAILURE(errorCode)) { 1298 key.setToBogus(); 1299 } else if(key.isBogus()) { 1300 errorCode = U_MEMORY_ALLOCATION_ERROR; 1301 } else { 1302 key.setLength(sink.NumberOfBytesAppended()); 1303 } 1304 return key; 1305 } 1306 1307 int32_t 1308 RuleBasedCollator::getSortKey(const UnicodeString &s, 1309 uint8_t *dest, int32_t capacity) const { 1310 return getSortKey(s.getBuffer(), s.length(), dest, capacity); 1311 } 1312 1313 int32_t 1314 RuleBasedCollator::getSortKey(const UChar *s, int32_t length, 1315 uint8_t *dest, int32_t capacity) const { 1316 if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) { 1317 return 0; 1318 } 1319 uint8_t noDest[1] = { 0 }; 1320 if(dest == NULL) { 1321 // Distinguish pure preflighting from an allocation error. 1322 dest = noDest; 1323 capacity = 0; 1324 } 1325 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity); 1326 UErrorCode errorCode = U_ZERO_ERROR; 1327 writeSortKey(s, length, sink, errorCode); 1328 return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0; 1329 } 1330 1331 void 1332 RuleBasedCollator::writeSortKey(const UChar *s, int32_t length, 1333 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1334 if(U_FAILURE(errorCode)) { return; } 1335 const UChar *limit = (length >= 0) ? 
s + length : NULL; 1336 UBool numeric = settings->isNumeric(); 1337 CollationKeys::LevelCallback callback; 1338 if(settings->dontCheckFCD()) { 1339 UTF16CollationIterator iter(data, numeric, s, s, limit); 1340 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1341 sink, Collation::PRIMARY_LEVEL, 1342 callback, TRUE, errorCode); 1343 } else { 1344 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1345 CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings, 1346 sink, Collation::PRIMARY_LEVEL, 1347 callback, TRUE, errorCode); 1348 } 1349 if(settings->getStrength() == UCOL_IDENTICAL) { 1350 writeIdenticalLevel(s, limit, sink, errorCode); 1351 } 1352 static const char terminator = 0; // TERMINATOR_BYTE 1353 sink.Append(&terminator, 1); 1354 } 1355 1356 void 1357 RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit, 1358 SortKeyByteSink &sink, UErrorCode &errorCode) const { 1359 // NFD quick check 1360 const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode); 1361 if(U_FAILURE(errorCode)) { return; } 1362 sink.Append(Collation::LEVEL_SEPARATOR_BYTE); 1363 UChar32 prev = 0; 1364 if(nfdQCYesLimit != s) { 1365 prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink); 1366 } 1367 // Is there non-NFD text? 1368 int32_t destLengthEstimate; 1369 if(limit != NULL) { 1370 if(nfdQCYesLimit == limit) { return; } 1371 destLengthEstimate = (int32_t)(limit - nfdQCYesLimit); 1372 } else { 1373 // s is NUL-terminated 1374 if(*nfdQCYesLimit == 0) { return; } 1375 destLengthEstimate = -1; 1376 } 1377 UnicodeString nfd; 1378 data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode); 1379 u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink); 1380 } 1381 1382 namespace { 1383 1384 /** 1385 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary() 1386 * with an instance of this callback class. 
1387 * When another level is about to be written, the callback 1388 * records the level and the number of bytes that will be written until 1389 * the sink (which is actually a FixedSortKeyByteSink) fills up. 1390 * 1391 * When internalNextSortKeyPart() is called again, it restarts with the last level 1392 * and ignores as many bytes as were written previously for that level. 1393 */ 1394 class PartLevelCallback : public CollationKeys::LevelCallback { 1395 public: 1396 PartLevelCallback(const SortKeyByteSink &s) 1397 : sink(s), level(Collation::PRIMARY_LEVEL) { 1398 levelCapacity = sink.GetRemainingCapacity(); 1399 } 1400 virtual ~PartLevelCallback() {} 1401 virtual UBool needToWrite(Collation::Level l) { 1402 if(!sink.Overflowed()) { 1403 // Remember a level that will be at least partially written. 1404 level = l; 1405 levelCapacity = sink.GetRemainingCapacity(); 1406 return TRUE; 1407 } else { 1408 return FALSE; 1409 } 1410 } 1411 Collation::Level getLevel() const { return level; } 1412 int32_t getLevelCapacity() const { return levelCapacity; } 1413 1414 private: 1415 const SortKeyByteSink &sink; 1416 Collation::Level level; 1417 int32_t levelCapacity; 1418 }; 1419 1420 } // namespace 1421 1422 int32_t 1423 RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2], 1424 uint8_t *dest, int32_t count, UErrorCode &errorCode) const { 1425 if(U_FAILURE(errorCode)) { return 0; } 1426 if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) { 1427 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1428 return 0; 1429 } 1430 if(count == 0) { return 0; } 1431 1432 FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count); 1433 sink.IgnoreBytes((int32_t)state[1]); 1434 iter->move(iter, 0, UITER_START); 1435 1436 Collation::Level level = (Collation::Level)state[0]; 1437 if(level <= Collation::QUATERNARY_LEVEL) { 1438 UBool numeric = settings->isNumeric(); 1439 PartLevelCallback callback(sink); 1440 if(settings->dontCheckFCD()) { 1441 
UIterCollationIterator ci(data, numeric, *iter); 1442 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1443 sink, level, callback, FALSE, errorCode); 1444 } else { 1445 FCDUIterCollationIterator ci(data, numeric, *iter, 0); 1446 CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings, 1447 sink, level, callback, FALSE, errorCode); 1448 } 1449 if(U_FAILURE(errorCode)) { return 0; } 1450 if(sink.NumberOfBytesAppended() > count) { 1451 state[0] = (uint32_t)callback.getLevel(); 1452 state[1] = (uint32_t)callback.getLevelCapacity(); 1453 return count; 1454 } 1455 // All of the normal levels are done. 1456 if(settings->getStrength() == UCOL_IDENTICAL) { 1457 level = Collation::IDENTICAL_LEVEL; 1458 iter->move(iter, 0, UITER_START); 1459 } 1460 // else fall through to setting ZERO_LEVEL 1461 } 1462 1463 if(level == Collation::IDENTICAL_LEVEL) { 1464 int32_t levelCapacity = sink.GetRemainingCapacity(); 1465 UnicodeString s; 1466 for(;;) { 1467 UChar32 c = iter->next(iter); 1468 if(c < 0) { break; } 1469 s.append((UChar)c); 1470 } 1471 const UChar *sArray = s.getBuffer(); 1472 writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode); 1473 if(U_FAILURE(errorCode)) { return 0; } 1474 if(sink.NumberOfBytesAppended() > count) { 1475 state[0] = (uint32_t)level; 1476 state[1] = (uint32_t)levelCapacity; 1477 return count; 1478 } 1479 } 1480 1481 // ZERO_LEVEL: Fill the remainder of dest with 00 bytes. 
1482 state[0] = (uint32_t)Collation::ZERO_LEVEL; 1483 state[1] = 0; 1484 int32_t length = sink.NumberOfBytesAppended(); 1485 int32_t i = length; 1486 while(i < count) { dest[i++] = 0; } 1487 return length; 1488 } 1489 1490 void 1491 RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces, 1492 UErrorCode &errorCode) const { 1493 if(U_FAILURE(errorCode)) { return; } 1494 const UChar *s = str.getBuffer(); 1495 const UChar *limit = s + str.length(); 1496 UBool numeric = settings->isNumeric(); 1497 if(settings->dontCheckFCD()) { 1498 UTF16CollationIterator iter(data, numeric, s, s, limit); 1499 int64_t ce; 1500 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1501 ces.addElement(ce, errorCode); 1502 } 1503 } else { 1504 FCDUTF16CollationIterator iter(data, numeric, s, s, limit); 1505 int64_t ce; 1506 while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) { 1507 ces.addElement(ce, errorCode); 1508 } 1509 } 1510 } 1511 1512 namespace { 1513 1514 void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length, 1515 UErrorCode &errorCode) { 1516 if(U_FAILURE(errorCode) || length == 0) { return; } 1517 if(!s.isEmpty()) { 1518 s.append('_', errorCode); 1519 } 1520 s.append(letter, errorCode); 1521 for(int32_t i = 0; i < length; ++i) { 1522 s.append(uprv_toupper(subtag[i]), errorCode); 1523 } 1524 } 1525 1526 void appendAttribute(CharString &s, char letter, UColAttributeValue value, 1527 UErrorCode &errorCode) { 1528 if(U_FAILURE(errorCode)) { return; } 1529 if(!s.isEmpty()) { 1530 s.append('_', errorCode); 1531 } 1532 static const char *valueChars = "1234...........IXO..SN..LU......"; 1533 s.append(letter, errorCode); 1534 s.append(valueChars[value], errorCode); 1535 } 1536 1537 } // namespace 1538 1539 int32_t 1540 RuleBasedCollator::internalGetShortDefinitionString(const char *locale, 1541 char *buffer, int32_t capacity, 1542 UErrorCode &errorCode) const { 1543 if(U_FAILURE(errorCode)) { return 0; } 1544 if(buffer == NULL ? 
capacity != 0 : capacity < 0) { 1545 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 1546 return 0; 1547 } 1548 if(locale == NULL) { 1549 locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode); 1550 } 1551 1552 char resultLocale[ULOC_FULLNAME_CAPACITY + 1]; 1553 int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY, 1554 "collation", locale, 1555 NULL, &errorCode); 1556 if(U_FAILURE(errorCode)) { return 0; } 1557 if(length == 0) { 1558 uprv_strcpy(resultLocale, "root"); 1559 } else { 1560 resultLocale[length] = 0; 1561 } 1562 1563 // Append items in alphabetic order of their short definition letters. 1564 CharString result; 1565 char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY]; 1566 1567 if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) { 1568 appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode); 1569 } 1570 // ATTR_VARIABLE_TOP not supported because 'B' was broken. 1571 // See ICU tickets #10372 and #10386. 1572 if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) { 1573 appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode); 1574 } 1575 if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) { 1576 appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode); 1577 } 1578 if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) { 1579 appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode); 1580 } 1581 if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) { 1582 appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode); 1583 } 1584 // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default. 
1585 length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode); 1586 appendSubtag(result, 'K', subtag, length, errorCode); 1587 length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode); 1588 appendSubtag(result, 'L', subtag, length, errorCode); 1589 if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) { 1590 appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode); 1591 } 1592 length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode); 1593 appendSubtag(result, 'R', subtag, length, errorCode); 1594 if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) { 1595 appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode); 1596 } 1597 length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode); 1598 appendSubtag(result, 'V', subtag, length, errorCode); 1599 length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode); 1600 appendSubtag(result, 'Z', subtag, length, errorCode); 1601 1602 if(U_FAILURE(errorCode)) { return 0; } 1603 if(result.length() <= capacity) { 1604 uprv_memcpy(buffer, result.data(), result.length()); 1605 } 1606 return u_terminateChars(buffer, capacity, result.length(), &errorCode); 1607 } 1608 1609 UBool 1610 RuleBasedCollator::isUnsafe(UChar32 c) const { 1611 return data->isUnsafeBackward(c, settings->isNumeric()); 1612 } 1613 1614 void U_CALLCONV 1615 RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) { 1616 t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode); 1617 } 1618 1619 UBool 1620 RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const { 1621 umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode); 1622 return U_SUCCESS(errorCode); 1623 } 1624 1625 CollationElementIterator * 1626 RuleBasedCollator::createCollationElementIterator(const 
UnicodeString& source) const { 1627 UErrorCode errorCode = U_ZERO_ERROR; 1628 if(!initMaxExpansions(errorCode)) { return NULL; } 1629 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1630 if(U_FAILURE(errorCode)) { 1631 delete cei; 1632 return NULL; 1633 } 1634 return cei; 1635 } 1636 1637 CollationElementIterator * 1638 RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const { 1639 UErrorCode errorCode = U_ZERO_ERROR; 1640 if(!initMaxExpansions(errorCode)) { return NULL; } 1641 CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode); 1642 if(U_FAILURE(errorCode)) { 1643 delete cei; 1644 return NULL; 1645 } 1646 return cei; 1647 } 1648 1649 int32_t 1650 RuleBasedCollator::getMaxExpansion(int32_t order) const { 1651 UErrorCode errorCode = U_ZERO_ERROR; 1652 (void)initMaxExpansions(errorCode); 1653 return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order); 1654 } 1655 1656 U_NAMESPACE_END 1657 1658 #endif // !UCONFIG_NO_COLLATION 1659