1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2015, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * rulebasedcollator.cpp 7 * 8 * (replaced the former tblcoll.cpp) 9 * 10 * created on: 2012feb14 with new and old collation code 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/coll.h" 19 #include "unicode/coleitr.h" 20 #include "unicode/localpointer.h" 21 #include "unicode/locid.h" 22 #include "unicode/sortkey.h" 23 #include "unicode/tblcoll.h" 24 #include "unicode/ucol.h" 25 #include "unicode/uiter.h" 26 #include "unicode/uloc.h" 27 #include "unicode/uniset.h" 28 #include "unicode/unistr.h" 29 #include "unicode/usetiter.h" 30 #include "unicode/utf8.h" 31 #include "unicode/uversion.h" 32 #include "bocsu.h" 33 #include "charstr.h" 34 #include "cmemory.h" 35 #include "collation.h" 36 #include "collationcompare.h" 37 #include "collationdata.h" 38 #include "collationdatareader.h" 39 #include "collationfastlatin.h" 40 #include "collationiterator.h" 41 #include "collationkeys.h" 42 #include "collationroot.h" 43 #include "collationsets.h" 44 #include "collationsettings.h" 45 #include "collationtailoring.h" 46 #include "cstring.h" 47 #include "uassert.h" 48 #include "ucol_imp.h" 49 #include "uhash.h" 50 #include "uitercollationiterator.h" 51 #include "ustr_imp.h" 52 #include "utf16collationiterator.h" 53 #include "utf8collationiterator.h" 54 #include "uvectr64.h" 55 56 U_NAMESPACE_BEGIN 57 58 namespace { 59 60 class FixedSortKeyByteSink : public SortKeyByteSink { 61 public: 62 FixedSortKeyByteSink(char *dest, int32_t destCapacity) 63 : SortKeyByteSink(dest, destCapacity) {} 64 virtual ~FixedSortKeyByteSink(); 65 66 private: 67 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 68 
virtual UBool Resize(int32_t appendCapacity, int32_t length); 69 }; 70 71 FixedSortKeyByteSink::~FixedSortKeyByteSink() {} 72 73 void 74 FixedSortKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t /*n*/, int32_t length) { 75 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 76 // Fill the buffer completely. 77 int32_t available = capacity_ - length; 78 if (available > 0) { 79 uprv_memcpy(buffer_ + length, bytes, available); 80 } 81 } 82 83 UBool 84 FixedSortKeyByteSink::Resize(int32_t /*appendCapacity*/, int32_t /*length*/) { 85 return FALSE; 86 } 87 88 } // namespace 89 90 // Not in an anonymous namespace, so that it can be a friend of CollationKey. 91 class CollationKeyByteSink : public SortKeyByteSink { 92 public: 93 CollationKeyByteSink(CollationKey &key) 94 : SortKeyByteSink(reinterpret_cast<char *>(key.getBytes()), key.getCapacity()), 95 key_(key) {} 96 virtual ~CollationKeyByteSink(); 97 98 private: 99 virtual void AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length); 100 virtual UBool Resize(int32_t appendCapacity, int32_t length); 101 102 CollationKey &key_; 103 }; 104 105 CollationKeyByteSink::~CollationKeyByteSink() {} 106 107 void 108 CollationKeyByteSink::AppendBeyondCapacity(const char *bytes, int32_t n, int32_t length) { 109 // buffer_ != NULL && bytes != NULL && n > 0 && appended_ > capacity_ 110 if (Resize(n, length)) { 111 uprv_memcpy(buffer_ + length, bytes, n); 112 } 113 } 114 115 UBool 116 CollationKeyByteSink::Resize(int32_t appendCapacity, int32_t length) { 117 if (buffer_ == NULL) { 118 return FALSE; // allocation failed before already 119 } 120 int32_t newCapacity = 2 * capacity_; 121 int32_t altCapacity = length + 2 * appendCapacity; 122 if (newCapacity < altCapacity) { 123 newCapacity = altCapacity; 124 } 125 if (newCapacity < 200) { 126 newCapacity = 200; 127 } 128 uint8_t *newBuffer = key_.reallocate(newCapacity, length); 129 if (newBuffer == NULL) { 130 SetNotOk(); 131 return FALSE; 132 } 133 
buffer_ = reinterpret_cast<char *>(newBuffer); 134 capacity_ = newCapacity; 135 return TRUE; 136 } 137 138 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator &other) 139 : Collator(other), 140 data(other.data), 141 settings(other.settings), 142 tailoring(other.tailoring), 143 cacheEntry(other.cacheEntry), 144 validLocale(other.validLocale), 145 explicitlySetAttributes(other.explicitlySetAttributes), 146 actualLocaleIsSameAsValid(other.actualLocaleIsSameAsValid) { 147 settings->addRef(); 148 cacheEntry->addRef(); 149 } 150 151 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 152 const RuleBasedCollator *base, UErrorCode &errorCode) 153 : data(NULL), 154 settings(NULL), 155 tailoring(NULL), 156 cacheEntry(NULL), 157 validLocale(""), 158 explicitlySetAttributes(0), 159 actualLocaleIsSameAsValid(FALSE) { 160 if(U_FAILURE(errorCode)) { return; } 161 if(bin == NULL || length == 0 || base == NULL) { 162 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 163 return; 164 } 165 const CollationTailoring *root = CollationRoot::getRoot(errorCode); 166 if(U_FAILURE(errorCode)) { return; } 167 if(base->tailoring != root) { 168 errorCode = U_UNSUPPORTED_ERROR; 169 return; 170 } 171 LocalPointer<CollationTailoring> t(new CollationTailoring(base->tailoring->settings)); 172 if(t.isNull() || t->isBogus()) { 173 errorCode = U_MEMORY_ALLOCATION_ERROR; 174 return; 175 } 176 CollationDataReader::read(base->tailoring, bin, length, *t, errorCode); 177 if(U_FAILURE(errorCode)) { return; } 178 t->actualLocale.setToBogus(); 179 adoptTailoring(t.orphan(), errorCode); 180 } 181 182 RuleBasedCollator::RuleBasedCollator(const CollationCacheEntry *entry) 183 : data(entry->tailoring->data), 184 settings(entry->tailoring->settings), 185 tailoring(entry->tailoring), 186 cacheEntry(entry), 187 validLocale(entry->validLocale), 188 explicitlySetAttributes(0), 189 actualLocaleIsSameAsValid(FALSE) { 190 settings->addRef(); 191 cacheEntry->addRef(); 192 } 193 194 
RuleBasedCollator::~RuleBasedCollator() { 195 SharedObject::clearPtr(settings); 196 SharedObject::clearPtr(cacheEntry); 197 } 198 199 void 200 RuleBasedCollator::adoptTailoring(CollationTailoring *t, UErrorCode &errorCode) { 201 if(U_FAILURE(errorCode)) { 202 t->deleteIfZeroRefCount(); 203 return; 204 } 205 U_ASSERT(settings == NULL && data == NULL && tailoring == NULL && cacheEntry == NULL); 206 cacheEntry = new CollationCacheEntry(t->actualLocale, t); 207 if(cacheEntry == NULL) { 208 errorCode = U_MEMORY_ALLOCATION_ERROR; 209 t->deleteIfZeroRefCount(); 210 return; 211 } 212 data = t->data; 213 settings = t->settings; 214 settings->addRef(); 215 tailoring = t; 216 cacheEntry->addRef(); 217 validLocale = t->actualLocale; 218 actualLocaleIsSameAsValid = FALSE; 219 } 220 221 Collator * 222 RuleBasedCollator::clone() const { 223 return new RuleBasedCollator(*this); 224 } 225 226 RuleBasedCollator &RuleBasedCollator::operator=(const RuleBasedCollator &other) { 227 if(this == &other) { return *this; } 228 SharedObject::copyPtr(other.settings, settings); 229 tailoring = other.tailoring; 230 SharedObject::copyPtr(other.cacheEntry, cacheEntry); 231 data = tailoring->data; 232 validLocale = other.validLocale; 233 explicitlySetAttributes = other.explicitlySetAttributes; 234 actualLocaleIsSameAsValid = other.actualLocaleIsSameAsValid; 235 return *this; 236 } 237 238 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 239 240 UBool 241 RuleBasedCollator::operator==(const Collator& other) const { 242 if(this == &other) { return TRUE; } 243 if(!Collator::operator==(other)) { return FALSE; } 244 const RuleBasedCollator &o = static_cast<const RuleBasedCollator &>(other); 245 if(*settings != *o.settings) { return FALSE; } 246 if(data == o.data) { return TRUE; } 247 UBool thisIsRoot = data->base == NULL; 248 UBool otherIsRoot = o.data->base == NULL; 249 U_ASSERT(!thisIsRoot || !otherIsRoot); // otherwise their data pointers should be == 250 if(thisIsRoot != otherIsRoot) { return 
FALSE; } 251 if((thisIsRoot || !tailoring->rules.isEmpty()) && 252 (otherIsRoot || !o.tailoring->rules.isEmpty())) { 253 // Shortcut: If both collators have valid rule strings, then compare those. 254 if(tailoring->rules == o.tailoring->rules) { return TRUE; } 255 } 256 // Different rule strings can result in the same or equivalent tailoring. 257 // The rule strings are optional in ICU resource bundles, although included by default. 258 // cloneBinary() drops the rule string. 259 UErrorCode errorCode = U_ZERO_ERROR; 260 LocalPointer<UnicodeSet> thisTailored(getTailoredSet(errorCode)); 261 LocalPointer<UnicodeSet> otherTailored(o.getTailoredSet(errorCode)); 262 if(U_FAILURE(errorCode)) { return FALSE; } 263 if(*thisTailored != *otherTailored) { return FALSE; } 264 // For completeness, we should compare all of the mappings; 265 // or we should create a list of strings, sort it with one collator, 266 // and check if both collators compare adjacent strings the same 267 // (order & strength, down to quaternary); or similar. 268 // Testing equality of collators seems unusual. 269 return TRUE; 270 } 271 272 int32_t 273 RuleBasedCollator::hashCode() const { 274 int32_t h = settings->hashCode(); 275 if(data->base == NULL) { return h; } // root collator 276 // Do not rely on the rule string, see comments in operator==(). 
277 UErrorCode errorCode = U_ZERO_ERROR; 278 LocalPointer<UnicodeSet> set(getTailoredSet(errorCode)); 279 if(U_FAILURE(errorCode)) { return 0; } 280 UnicodeSetIterator iter(*set); 281 while(iter.next() && !iter.isString()) { 282 h ^= data->getCE32(iter.getCodepoint()); 283 } 284 return h; 285 } 286 287 void 288 RuleBasedCollator::setLocales(const Locale &requested, const Locale &valid, 289 const Locale &actual) { 290 if(actual == tailoring->actualLocale) { 291 actualLocaleIsSameAsValid = FALSE; 292 } else { 293 U_ASSERT(actual == valid); 294 actualLocaleIsSameAsValid = TRUE; 295 } 296 // Do not modify tailoring.actualLocale: 297 // We cannot be sure that that would be thread-safe. 298 validLocale = valid; 299 (void)requested; // Ignore, see also ticket #10477. 300 } 301 302 Locale 303 RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode& errorCode) const { 304 if(U_FAILURE(errorCode)) { 305 return Locale::getRoot(); 306 } 307 switch(type) { 308 case ULOC_ACTUAL_LOCALE: 309 return actualLocaleIsSameAsValid ? validLocale : tailoring->actualLocale; 310 case ULOC_VALID_LOCALE: 311 return validLocale; 312 case ULOC_REQUESTED_LOCALE: 313 default: 314 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 315 return Locale::getRoot(); 316 } 317 } 318 319 const char * 320 RuleBasedCollator::internalGetLocaleID(ULocDataLocaleType type, UErrorCode &errorCode) const { 321 if(U_FAILURE(errorCode)) { 322 return NULL; 323 } 324 const Locale *result; 325 switch(type) { 326 case ULOC_ACTUAL_LOCALE: 327 result = actualLocaleIsSameAsValid ? &validLocale : &tailoring->actualLocale; 328 break; 329 case ULOC_VALID_LOCALE: 330 result = &validLocale; 331 break; 332 case ULOC_REQUESTED_LOCALE: 333 default: 334 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 335 return NULL; 336 } 337 if(result->isBogus()) { return NULL; } 338 const char *id = result->getName(); 339 return id[0] == 0 ? 
"root" : id; 340 } 341 342 const UnicodeString& 343 RuleBasedCollator::getRules() const { 344 return tailoring->rules; 345 } 346 347 void 348 RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) const { 349 if(delta == UCOL_TAILORING_ONLY) { 350 buffer = tailoring->rules; 351 return; 352 } 353 // UCOL_FULL_RULES 354 buffer.remove(); 355 CollationLoader::appendRootRules(buffer); 356 buffer.append(tailoring->rules).getTerminatedBuffer(); 357 } 358 359 void 360 RuleBasedCollator::getVersion(UVersionInfo version) const { 361 uprv_memcpy(version, tailoring->version, U_MAX_VERSION_LENGTH); 362 version[0] += (UCOL_RUNTIME_VERSION << 4) + (UCOL_RUNTIME_VERSION >> 4); 363 } 364 365 UnicodeSet * 366 RuleBasedCollator::getTailoredSet(UErrorCode &errorCode) const { 367 if(U_FAILURE(errorCode)) { return NULL; } 368 UnicodeSet *tailored = new UnicodeSet(); 369 if(tailored == NULL) { 370 errorCode = U_MEMORY_ALLOCATION_ERROR; 371 return NULL; 372 } 373 if(data->base != NULL) { 374 TailoredSet(tailored).forData(data, errorCode); 375 if(U_FAILURE(errorCode)) { 376 delete tailored; 377 return NULL; 378 } 379 } 380 return tailored; 381 } 382 383 void 384 RuleBasedCollator::internalGetContractionsAndExpansions( 385 UnicodeSet *contractions, UnicodeSet *expansions, 386 UBool addPrefixes, UErrorCode &errorCode) const { 387 if(U_FAILURE(errorCode)) { return; } 388 if(contractions != NULL) { 389 contractions->clear(); 390 } 391 if(expansions != NULL) { 392 expansions->clear(); 393 } 394 ContractionsAndExpansions(contractions, expansions, NULL, addPrefixes).forData(data, errorCode); 395 } 396 397 void 398 RuleBasedCollator::internalAddContractions(UChar32 c, UnicodeSet &set, UErrorCode &errorCode) const { 399 if(U_FAILURE(errorCode)) { return; } 400 ContractionsAndExpansions(&set, NULL, NULL, FALSE).forCodePoint(data, c, errorCode); 401 } 402 403 const CollationSettings & 404 RuleBasedCollator::getDefaultSettings() const { 405 return *tailoring->settings; 406 } 407 408 
UColAttributeValue 409 RuleBasedCollator::getAttribute(UColAttribute attr, UErrorCode &errorCode) const { 410 if(U_FAILURE(errorCode)) { return UCOL_DEFAULT; } 411 int32_t option; 412 switch(attr) { 413 case UCOL_FRENCH_COLLATION: 414 option = CollationSettings::BACKWARD_SECONDARY; 415 break; 416 case UCOL_ALTERNATE_HANDLING: 417 return settings->getAlternateHandling(); 418 case UCOL_CASE_FIRST: 419 return settings->getCaseFirst(); 420 case UCOL_CASE_LEVEL: 421 option = CollationSettings::CASE_LEVEL; 422 break; 423 case UCOL_NORMALIZATION_MODE: 424 option = CollationSettings::CHECK_FCD; 425 break; 426 case UCOL_STRENGTH: 427 return (UColAttributeValue)settings->getStrength(); 428 case UCOL_HIRAGANA_QUATERNARY_MODE: 429 // Deprecated attribute, unsettable. 430 return UCOL_OFF; 431 case UCOL_NUMERIC_COLLATION: 432 option = CollationSettings::NUMERIC; 433 break; 434 default: 435 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 436 return UCOL_DEFAULT; 437 } 438 return ((settings->options & option) == 0) ? 
UCOL_OFF : UCOL_ON; 439 } 440 441 void 442 RuleBasedCollator::setAttribute(UColAttribute attr, UColAttributeValue value, 443 UErrorCode &errorCode) { 444 UColAttributeValue oldValue = getAttribute(attr, errorCode); 445 if(U_FAILURE(errorCode)) { return; } 446 if(value == oldValue) { 447 setAttributeExplicitly(attr); 448 return; 449 } 450 const CollationSettings &defaultSettings = getDefaultSettings(); 451 if(settings == &defaultSettings) { 452 if(value == UCOL_DEFAULT) { 453 setAttributeDefault(attr); 454 return; 455 } 456 } 457 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 458 if(ownedSettings == NULL) { 459 errorCode = U_MEMORY_ALLOCATION_ERROR; 460 return; 461 } 462 463 switch(attr) { 464 case UCOL_FRENCH_COLLATION: 465 ownedSettings->setFlag(CollationSettings::BACKWARD_SECONDARY, value, 466 defaultSettings.options, errorCode); 467 break; 468 case UCOL_ALTERNATE_HANDLING: 469 ownedSettings->setAlternateHandling(value, defaultSettings.options, errorCode); 470 break; 471 case UCOL_CASE_FIRST: 472 ownedSettings->setCaseFirst(value, defaultSettings.options, errorCode); 473 break; 474 case UCOL_CASE_LEVEL: 475 ownedSettings->setFlag(CollationSettings::CASE_LEVEL, value, 476 defaultSettings.options, errorCode); 477 break; 478 case UCOL_NORMALIZATION_MODE: 479 ownedSettings->setFlag(CollationSettings::CHECK_FCD, value, 480 defaultSettings.options, errorCode); 481 break; 482 case UCOL_STRENGTH: 483 ownedSettings->setStrength(value, defaultSettings.options, errorCode); 484 break; 485 case UCOL_HIRAGANA_QUATERNARY_MODE: 486 // Deprecated attribute. Check for valid values but do not change anything. 
487 if(value != UCOL_OFF && value != UCOL_ON && value != UCOL_DEFAULT) { 488 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 489 } 490 break; 491 case UCOL_NUMERIC_COLLATION: 492 ownedSettings->setFlag(CollationSettings::NUMERIC, value, defaultSettings.options, errorCode); 493 break; 494 default: 495 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 496 break; 497 } 498 if(U_FAILURE(errorCode)) { return; } 499 setFastLatinOptions(*ownedSettings); 500 if(value == UCOL_DEFAULT) { 501 setAttributeDefault(attr); 502 } else { 503 setAttributeExplicitly(attr); 504 } 505 } 506 507 Collator & 508 RuleBasedCollator::setMaxVariable(UColReorderCode group, UErrorCode &errorCode) { 509 if(U_FAILURE(errorCode)) { return *this; } 510 // Convert the reorder code into a MaxVariable number, or UCOL_DEFAULT=-1. 511 int32_t value; 512 if(group == UCOL_REORDER_CODE_DEFAULT) { 513 value = UCOL_DEFAULT; 514 } else if(UCOL_REORDER_CODE_FIRST <= group && group <= UCOL_REORDER_CODE_CURRENCY) { 515 value = group - UCOL_REORDER_CODE_FIRST; 516 } else { 517 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 518 return *this; 519 } 520 CollationSettings::MaxVariable oldValue = settings->getMaxVariable(); 521 if(value == oldValue) { 522 setAttributeExplicitly(ATTR_VARIABLE_TOP); 523 return *this; 524 } 525 const CollationSettings &defaultSettings = getDefaultSettings(); 526 if(settings == &defaultSettings) { 527 if(value == UCOL_DEFAULT) { 528 setAttributeDefault(ATTR_VARIABLE_TOP); 529 return *this; 530 } 531 } 532 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 533 if(ownedSettings == NULL) { 534 errorCode = U_MEMORY_ALLOCATION_ERROR; 535 return *this; 536 } 537 538 if(group == UCOL_REORDER_CODE_DEFAULT) { 539 group = (UColReorderCode)(UCOL_REORDER_CODE_FIRST + defaultSettings.getMaxVariable()); 540 } 541 uint32_t varTop = data->getLastPrimaryForGroup(group); 542 U_ASSERT(varTop != 0); 543 ownedSettings->setMaxVariable(value, defaultSettings.options, errorCode); 544 if(U_FAILURE(errorCode)) { return 
*this; } 545 ownedSettings->variableTop = varTop; 546 setFastLatinOptions(*ownedSettings); 547 if(value == UCOL_DEFAULT) { 548 setAttributeDefault(ATTR_VARIABLE_TOP); 549 } else { 550 setAttributeExplicitly(ATTR_VARIABLE_TOP); 551 } 552 return *this; 553 } 554 555 UColReorderCode 556 RuleBasedCollator::getMaxVariable() const { 557 return (UColReorderCode)(UCOL_REORDER_CODE_FIRST + settings->getMaxVariable()); 558 } 559 560 uint32_t 561 RuleBasedCollator::getVariableTop(UErrorCode & /*errorCode*/) const { 562 return settings->variableTop; 563 } 564 565 uint32_t 566 RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &errorCode) { 567 if(U_FAILURE(errorCode)) { return 0; } 568 if(varTop == NULL && len !=0) { 569 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 570 return 0; 571 } 572 if(len < 0) { len = u_strlen(varTop); } 573 if(len == 0) { 574 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 575 return 0; 576 } 577 UBool numeric = settings->isNumeric(); 578 int64_t ce1, ce2; 579 if(settings->dontCheckFCD()) { 580 UTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len); 581 ce1 = ci.nextCE(errorCode); 582 ce2 = ci.nextCE(errorCode); 583 } else { 584 FCDUTF16CollationIterator ci(data, numeric, varTop, varTop, varTop + len); 585 ce1 = ci.nextCE(errorCode); 586 ce2 = ci.nextCE(errorCode); 587 } 588 if(ce1 == Collation::NO_CE || ce2 != Collation::NO_CE) { 589 errorCode = U_CE_NOT_FOUND_ERROR; 590 return 0; 591 } 592 setVariableTop((uint32_t)(ce1 >> 32), errorCode); 593 return settings->variableTop; 594 } 595 596 uint32_t 597 RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &errorCode) { 598 return setVariableTop(varTop.getBuffer(), varTop.length(), errorCode); 599 } 600 601 void 602 RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &errorCode) { 603 if(U_FAILURE(errorCode)) { return; } 604 if(varTop != settings->variableTop) { 605 // Pin the variable top to the end of the reordering group which contains it. 
606 // Only a few special groups are supported. 607 int32_t group = data->getGroupForPrimary(varTop); 608 if(group < UCOL_REORDER_CODE_FIRST || UCOL_REORDER_CODE_CURRENCY < group) { 609 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 610 return; 611 } 612 uint32_t v = data->getLastPrimaryForGroup(group); 613 U_ASSERT(v != 0 && v >= varTop); 614 varTop = v; 615 if(varTop != settings->variableTop) { 616 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 617 if(ownedSettings == NULL) { 618 errorCode = U_MEMORY_ALLOCATION_ERROR; 619 return; 620 } 621 ownedSettings->setMaxVariable(group - UCOL_REORDER_CODE_FIRST, 622 getDefaultSettings().options, errorCode); 623 if(U_FAILURE(errorCode)) { return; } 624 ownedSettings->variableTop = varTop; 625 setFastLatinOptions(*ownedSettings); 626 } 627 } 628 if(varTop == getDefaultSettings().variableTop) { 629 setAttributeDefault(ATTR_VARIABLE_TOP); 630 } else { 631 setAttributeExplicitly(ATTR_VARIABLE_TOP); 632 } 633 } 634 635 int32_t 636 RuleBasedCollator::getReorderCodes(int32_t *dest, int32_t capacity, 637 UErrorCode &errorCode) const { 638 if(U_FAILURE(errorCode)) { return 0; } 639 if(capacity < 0 || (dest == NULL && capacity > 0)) { 640 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 641 return 0; 642 } 643 int32_t length = settings->reorderCodesLength; 644 if(length == 0) { return 0; } 645 if(length > capacity) { 646 errorCode = U_BUFFER_OVERFLOW_ERROR; 647 return length; 648 } 649 uprv_memcpy(dest, settings->reorderCodes, length * 4); 650 return length; 651 } 652 653 void 654 RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, int32_t length, 655 UErrorCode &errorCode) { 656 if(U_FAILURE(errorCode)) { return; } 657 if(length < 0 || (reorderCodes == NULL && length > 0)) { 658 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 659 return; 660 } 661 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_NONE) { 662 length = 0; 663 } 664 if(length == settings->reorderCodesLength && 665 uprv_memcmp(reorderCodes, 
settings->reorderCodes, length * 4) == 0) { 666 return; 667 } 668 const CollationSettings &defaultSettings = getDefaultSettings(); 669 if(length == 1 && reorderCodes[0] == UCOL_REORDER_CODE_DEFAULT) { 670 if(settings != &defaultSettings) { 671 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 672 if(ownedSettings == NULL) { 673 errorCode = U_MEMORY_ALLOCATION_ERROR; 674 return; 675 } 676 ownedSettings->copyReorderingFrom(defaultSettings, errorCode); 677 setFastLatinOptions(*ownedSettings); 678 } 679 return; 680 } 681 CollationSettings *ownedSettings = SharedObject::copyOnWrite(settings); 682 if(ownedSettings == NULL) { 683 errorCode = U_MEMORY_ALLOCATION_ERROR; 684 return; 685 } 686 ownedSettings->setReordering(*data, reorderCodes, length, errorCode); 687 setFastLatinOptions(*ownedSettings); 688 } 689 690 void 691 RuleBasedCollator::setFastLatinOptions(CollationSettings &ownedSettings) const { 692 ownedSettings.fastLatinOptions = CollationFastLatin::getOptions( 693 data, ownedSettings, 694 ownedSettings.fastLatinPrimaries, UPRV_LENGTHOF(ownedSettings.fastLatinPrimaries)); 695 } 696 697 UCollationResult 698 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right, 699 UErrorCode &errorCode) const { 700 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 701 return doCompare(left.getBuffer(), left.length(), 702 right.getBuffer(), right.length(), errorCode); 703 } 704 705 UCollationResult 706 RuleBasedCollator::compare(const UnicodeString &left, const UnicodeString &right, 707 int32_t length, UErrorCode &errorCode) const { 708 if(U_FAILURE(errorCode) || length == 0) { return UCOL_EQUAL; } 709 if(length < 0) { 710 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 711 return UCOL_EQUAL; 712 } 713 int32_t leftLength = left.length(); 714 int32_t rightLength = right.length(); 715 if(leftLength > length) { leftLength = length; } 716 if(rightLength > length) { rightLength = length; } 717 return doCompare(left.getBuffer(), leftLength, 718 
right.getBuffer(), rightLength, errorCode); 719 } 720 721 UCollationResult 722 RuleBasedCollator::compare(const UChar *left, int32_t leftLength, 723 const UChar *right, int32_t rightLength, 724 UErrorCode &errorCode) const { 725 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 726 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) { 727 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 728 return UCOL_EQUAL; 729 } 730 // Make sure both or neither strings have a known length. 731 // We do not optimize for mixed length/termination. 732 if(leftLength >= 0) { 733 if(rightLength < 0) { rightLength = u_strlen(right); } 734 } else { 735 if(rightLength >= 0) { leftLength = u_strlen(left); } 736 } 737 return doCompare(left, leftLength, right, rightLength, errorCode); 738 } 739 740 UCollationResult 741 RuleBasedCollator::compareUTF8(const StringPiece &left, const StringPiece &right, 742 UErrorCode &errorCode) const { 743 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 744 const uint8_t *leftBytes = reinterpret_cast<const uint8_t *>(left.data()); 745 const uint8_t *rightBytes = reinterpret_cast<const uint8_t *>(right.data()); 746 if((leftBytes == NULL && !left.empty()) || (rightBytes == NULL && !right.empty())) { 747 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 748 return UCOL_EQUAL; 749 } 750 return doCompare(leftBytes, left.length(), rightBytes, right.length(), errorCode); 751 } 752 753 UCollationResult 754 RuleBasedCollator::internalCompareUTF8(const char *left, int32_t leftLength, 755 const char *right, int32_t rightLength, 756 UErrorCode &errorCode) const { 757 if(U_FAILURE(errorCode)) { return UCOL_EQUAL; } 758 if((left == NULL && leftLength != 0) || (right == NULL && rightLength != 0)) { 759 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 760 return UCOL_EQUAL; 761 } 762 // Make sure both or neither strings have a known length. 763 // We do not optimize for mixed length/termination. 
764 if(leftLength >= 0) { 765 if(rightLength < 0) { rightLength = uprv_strlen(right); } 766 } else { 767 if(rightLength >= 0) { leftLength = uprv_strlen(left); } 768 } 769 return doCompare(reinterpret_cast<const uint8_t *>(left), leftLength, 770 reinterpret_cast<const uint8_t *>(right), rightLength, errorCode); 771 } 772 773 namespace { 774 775 /** 776 * Abstract iterator for identical-level string comparisons. 777 * Returns FCD code points and handles temporary switching to NFD. 778 */ 779 class NFDIterator : public UObject { 780 public: 781 NFDIterator() : index(-1), length(0) {} 782 virtual ~NFDIterator() {} 783 /** 784 * Returns the next code point from the internal normalization buffer, 785 * or else the next text code point. 786 * Returns -1 at the end of the text. 787 */ 788 UChar32 nextCodePoint() { 789 if(index >= 0) { 790 if(index == length) { 791 index = -1; 792 } else { 793 UChar32 c; 794 U16_NEXT_UNSAFE(decomp, index, c); 795 return c; 796 } 797 } 798 return nextRawCodePoint(); 799 } 800 /** 801 * @param nfcImpl 802 * @param c the last code point returned by nextCodePoint() or nextDecomposedCodePoint() 803 * @return the first code point in c's decomposition, 804 * or c itself if it was decomposed already or if it does not decompose 805 */ 806 UChar32 nextDecomposedCodePoint(const Normalizer2Impl &nfcImpl, UChar32 c) { 807 if(index >= 0) { return c; } 808 decomp = nfcImpl.getDecomposition(c, buffer, length); 809 if(decomp == NULL) { return c; } 810 index = 0; 811 U16_NEXT_UNSAFE(decomp, index, c); 812 return c; 813 } 814 protected: 815 /** 816 * Returns the next text code point in FCD order. 817 * Returns -1 at the end of the text. 
818 */ 819 virtual UChar32 nextRawCodePoint() = 0; 820 private: 821 const UChar *decomp; 822 UChar buffer[4]; 823 int32_t index; 824 int32_t length; 825 }; 826 827 class UTF16NFDIterator : public NFDIterator { 828 public: 829 UTF16NFDIterator(const UChar *text, const UChar *textLimit) : s(text), limit(textLimit) {} 830 protected: 831 virtual UChar32 nextRawCodePoint() { 832 if(s == limit) { return U_SENTINEL; } 833 UChar32 c = *s++; 834 if(limit == NULL && c == 0) { 835 s = NULL; 836 return U_SENTINEL; 837 } 838 UChar trail; 839 if(U16_IS_LEAD(c) && s != limit && U16_IS_TRAIL(trail = *s)) { 840 ++s; 841 c = U16_GET_SUPPLEMENTARY(c, trail); 842 } 843 return c; 844 } 845 846 const UChar *s; 847 const UChar *limit; 848 }; 849 850 class FCDUTF16NFDIterator : public UTF16NFDIterator { 851 public: 852 FCDUTF16NFDIterator(const Normalizer2Impl &nfcImpl, const UChar *text, const UChar *textLimit) 853 : UTF16NFDIterator(NULL, NULL) { 854 UErrorCode errorCode = U_ZERO_ERROR; 855 const UChar *spanLimit = nfcImpl.makeFCD(text, textLimit, NULL, errorCode); 856 if(U_FAILURE(errorCode)) { return; } 857 if(spanLimit == textLimit || (textLimit == NULL && *spanLimit == 0)) { 858 s = text; 859 limit = spanLimit; 860 } else { 861 str.setTo(text, (int32_t)(spanLimit - text)); 862 { 863 ReorderingBuffer buffer(nfcImpl, str); 864 if(buffer.init(str.length(), errorCode)) { 865 nfcImpl.makeFCD(spanLimit, textLimit, &buffer, errorCode); 866 } 867 } 868 if(U_SUCCESS(errorCode)) { 869 s = str.getBuffer(); 870 limit = s + str.length(); 871 } 872 } 873 } 874 private: 875 UnicodeString str; 876 }; 877 878 class UTF8NFDIterator : public NFDIterator { 879 public: 880 UTF8NFDIterator(const uint8_t *text, int32_t textLength) 881 : s(text), pos(0), length(textLength) {} 882 protected: 883 virtual UChar32 nextRawCodePoint() { 884 if(pos == length || (s[pos] == 0 && length < 0)) { return U_SENTINEL; } 885 UChar32 c; 886 U8_NEXT_OR_FFFD(s, pos, length, c); 887 return c; 888 } 889 890 const uint8_t *s; 
891 int32_t pos; 892 int32_t length; 893 }; 894 895 class FCDUTF8NFDIterator : public NFDIterator { 896 public: 897 FCDUTF8NFDIterator(const CollationData *data, const uint8_t *text, int32_t textLength) 898 : u8ci(data, FALSE, text, 0, textLength) {} 899 protected: 900 virtual UChar32 nextRawCodePoint() { 901 UErrorCode errorCode = U_ZERO_ERROR; 902 return u8ci.nextCodePoint(errorCode); 903 } 904 private: 905 FCDUTF8CollationIterator u8ci; 906 }; 907 908 class UIterNFDIterator : public NFDIterator { 909 public: 910 UIterNFDIterator(UCharIterator &it) : iter(it) {} 911 protected: 912 virtual UChar32 nextRawCodePoint() { 913 return uiter_next32(&iter); 914 } 915 private: 916 UCharIterator &iter; 917 }; 918 919 class FCDUIterNFDIterator : public NFDIterator { 920 public: 921 FCDUIterNFDIterator(const CollationData *data, UCharIterator &it, int32_t startIndex) 922 : uici(data, FALSE, it, startIndex) {} 923 protected: 924 virtual UChar32 nextRawCodePoint() { 925 UErrorCode errorCode = U_ZERO_ERROR; 926 return uici.nextCodePoint(errorCode); 927 } 928 private: 929 FCDUIterCollationIterator uici; 930 }; 931 932 UCollationResult compareNFDIter(const Normalizer2Impl &nfcImpl, 933 NFDIterator &left, NFDIterator &right) { 934 for(;;) { 935 // Fetch the next FCD code point from each string. 936 UChar32 leftCp = left.nextCodePoint(); 937 UChar32 rightCp = right.nextCodePoint(); 938 if(leftCp == rightCp) { 939 if(leftCp < 0) { break; } 940 continue; 941 } 942 // If they are different, then decompose each and compare again. 
// End of string becomes -2 and U+FFFE becomes -1; any other code point is replaced by
// its next decomposed code point before the two sides are ordered.
        if(leftCp < 0) {
            leftCp = -2;  // end of string
        } else if(leftCp == 0xfffe) {
            leftCp = -1;  // U+FFFE: merge separator
        } else {
            leftCp = left.nextDecomposedCodePoint(nfcImpl, leftCp);
        }
        if(rightCp < 0) {
            rightCp = -2;  // end of string
        } else if(rightCp == 0xfffe) {
            rightCp = -1;  // U+FFFE: merge separator
        } else {
            rightCp = right.nextDecomposedCodePoint(nfcImpl, rightCp);
        }
        if(leftCp < rightCp) { return UCOL_LESS; }
        if(leftCp > rightCp) { return UCOL_GREATER; }
    }
    return UCOL_EQUAL;
}

}  // namespace

/**
 * Compares two UTF-16 strings.
 *
 * A negative leftLength selects the NUL-terminated code path for both strings
 * (both limits are then NULL); otherwise both lengths are used as given.
 *
 * Steps, in order:
 * 1. Skip the code units that are identical in both strings.
 * 2. If the first differing unit of either string is "unsafe backward",
 *    shorten the equal prefix until left[equalPrefixLength] is safe (or 0 is reached).
 * 3. Try CollationFastLatin::compareUTF16(); if it is not applicable or bails out,
 *    compare with (FCD)UTF16CollationIterator.
 * 4. If still equal, no error, and the strength is at least UCOL_IDENTICAL,
 *    compare the remaining text through NFD iterators.
 *
 * @param left       first string
 * @param leftLength length of left, or negative if NUL-terminated
 * @param right      second string
 * @param rightLength length of right; only used when leftLength >= 0
 * @param errorCode  in/out error code; the caller has already checked U_FAILURE()
 * @return the comparison result
 */
UCollationResult
RuleBasedCollator::doCompare(const UChar *left, int32_t leftLength,
                             const UChar *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    const UChar *leftLimit;
    const UChar *rightLimit;
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        leftLimit = NULL;
        rightLimit = NULL;
        UChar c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        leftLimit = left + leftLength;
        rightLimit = right + rightLength;
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        if((equalPrefixLength != leftLength &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) ||
                (equalPrefixLength != rightLength &&
                    data->isUnsafeBackward(right[equalPrefixLength], numeric))) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            // Only left is inspected here: the two strings are identical inside the prefix.
            while(--equalPrefixLength > 0 &&
                    data->isUnsafeBackward(left[equalPrefixLength], numeric)) {}
        }
        // Notes:
        // - A longer string can compare equal to a prefix of it if only ignorables follow.
        // - With a backward level, a longer string can compare less-than a prefix of it.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // The fast path is attempted only if the options are non-negative and the first
    // remaining unit of each string (if any) is at most LATIN_MAX.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength,
                                                      leftLength - equalPrefixLength,
                                                      right + equalPrefixLength,
                                                      rightLength - equalPrefixLength);
        } else {
            result = CollationFastLatin::compareUTF16(data->fastLatinTable,
                                                      settings->fastLatinPrimaries,
                                                      fastLatinOptions,
                                                      left + equalPrefixLength, -1,
                                                      right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        if(settings->dontCheckFCD()) {
            UTF16CollationIterator leftIter(data, numeric,
                                            left, left + equalPrefixLength, leftLimit);
            UTF16CollationIterator rightIter(data, numeric,
                                            right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF16CollationIterator leftIter(data, numeric,
                                              left, left + equalPrefixLength, leftLimit);
            FCDUTF16CollationIterator rightIter(data, numeric,
                                                right, right + equalPrefixLength, rightLimit);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    if(settings->dontCheckFCD()) {
        UTF16NFDIterator leftIter(left, leftLimit);
        UTF16NFDIterator rightIter(right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF16NFDIterator leftIter(nfcImpl, left, leftLimit);
        FCDUTF16NFDIterator rightIter(nfcImpl, right, rightLimit);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}

/**
 * Compares two UTF-8 strings.
 *
 * Same overall structure as the UTF-16 overload, with two UTF-8 specifics:
 * the equal prefix is first shortened so that it does not end inside a
 * multi-byte sequence (U8_IS_TRAIL), and the "unsafe backward" test is done on
 * code points decoded with U8_NEXT_OR_FFFD / U8_PREV_OR_FFFD.
 *
 * @param left       first string
 * @param leftLength length of left in bytes, or negative if NUL-terminated
 * @param right      second string
 * @param rightLength length of right in bytes; compared against only when leftLength >= 0
 * @param errorCode  in/out error code; the caller has already checked U_FAILURE()
 * @return the comparison result
 */
UCollationResult
RuleBasedCollator::doCompare(const uint8_t *left, int32_t leftLength,
                             const uint8_t *right, int32_t rightLength,
                             UErrorCode &errorCode) const {
    // U_FAILURE(errorCode) checked by caller.
    if(left == right && leftLength == rightLength) {
        return UCOL_EQUAL;
    }

    // Identical-prefix test.
    int32_t equalPrefixLength = 0;
    if(leftLength < 0) {
        uint8_t c;
        while((c = left[equalPrefixLength]) == right[equalPrefixLength]) {
            if(c == 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }
    } else {
        for(;;) {
            if(equalPrefixLength == leftLength) {
                if(equalPrefixLength == rightLength) { return UCOL_EQUAL; }
                break;
            } else if(equalPrefixLength == rightLength ||
                      left[equalPrefixLength] != right[equalPrefixLength]) {
                break;
            }
            ++equalPrefixLength;
        }
    }
    // Back up to the start of a partially-equal code point.
    if(equalPrefixLength > 0 &&
            ((equalPrefixLength != leftLength && U8_IS_TRAIL(left[equalPrefixLength])) ||
            (equalPrefixLength != rightLength && U8_IS_TRAIL(right[equalPrefixLength])))) {
        while(--equalPrefixLength > 0 && U8_IS_TRAIL(left[equalPrefixLength])) {}
    }

    UBool numeric = settings->isNumeric();
    if(equalPrefixLength > 0) {
        UBool unsafe = FALSE;
        // Decode the first differing code point of each string with a scratch index,
        // so that equalPrefixLength itself is not advanced.
        if(equalPrefixLength != leftLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(left, i, leftLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(!unsafe && equalPrefixLength != rightLength) {
            int32_t i = equalPrefixLength;
            UChar32 c;
            U8_NEXT_OR_FFFD(right, i, rightLength, c);
            unsafe = data->isUnsafeBackward(c, numeric);
        }
        if(unsafe) {
            // Identical prefix: Back up to the start of a contraction or reordering sequence.
            UChar32 c;
            do {
                // The macro is given equalPrefixLength as its index argument,
                // so each iteration moves the prefix end back by one code point.
                U8_PREV_OR_FFFD(left, 0, equalPrefixLength, c);
            } while(equalPrefixLength > 0 && data->isUnsafeBackward(c, numeric));
        }
        // See the notes in the UTF-16 version.

        // Pass the actual start of each string into the CollationIterators,
        // plus the equalPrefixLength position,
        // so that prefix matches back into the equal prefix work.
    }

    int32_t result;
    int32_t fastLatinOptions = settings->fastLatinOptions;
    // The fast path is attempted only if the options are non-negative and the first
    // remaining byte of each string (if any) is at most LATIN_MAX_UTF8_LEAD.
    if(fastLatinOptions >= 0 &&
            (equalPrefixLength == leftLength ||
                left[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD) &&
            (equalPrefixLength == rightLength ||
                right[equalPrefixLength] <= CollationFastLatin::LATIN_MAX_UTF8_LEAD)) {
        if(leftLength >= 0) {
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength,
                                                     leftLength - equalPrefixLength,
                                                     right + equalPrefixLength,
                                                     rightLength - equalPrefixLength);
        } else {
            result = CollationFastLatin::compareUTF8(data->fastLatinTable,
                                                     settings->fastLatinPrimaries,
                                                     fastLatinOptions,
                                                     left + equalPrefixLength, -1,
                                                     right + equalPrefixLength, -1);
        }
    } else {
        result = CollationFastLatin::BAIL_OUT_RESULT;
    }

    if(result == CollationFastLatin::BAIL_OUT_RESULT) {
        if(settings->dontCheckFCD()) {
            UTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            UTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        } else {
            FCDUTF8CollationIterator leftIter(data, numeric, left, equalPrefixLength, leftLength);
            FCDUTF8CollationIterator rightIter(data, numeric, right, equalPrefixLength, rightLength);
            result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
        }
    }
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return (UCollationResult)result;
    }

    // Note: If NUL-terminated, we could get the actual limits from the iterators now.
    // That would complicate the iterators a bit, NUL-terminated strings are only a C convenience,
    // and the benefit seems unlikely to be measurable.

    // Compare identical level.
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    left += equalPrefixLength;
    right += equalPrefixLength;
    // The lengths are reduced only when leftLength is positive; negative
    // (NUL-terminated) lengths are passed on unchanged.
    if(leftLength > 0) {
        leftLength -= equalPrefixLength;
        rightLength -= equalPrefixLength;
    }
    if(settings->dontCheckFCD()) {
        UTF8NFDIterator leftIter(left, leftLength);
        UTF8NFDIterator rightIter(right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUTF8NFDIterator leftIter(data, left, leftLength);
        FCDUTF8NFDIterator rightIter(data, right, rightLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}

/**
 * Compares the text delivered by two UCharIterators.
 *
 * Returns UCOL_EQUAL immediately if errorCode already indicates a failure or if
 * both arguments are the same iterator object. Otherwise the iterators are
 * advanced over the identical prefix, moved back over the differing unit and,
 * if needed, over "unsafe backward" units, and then compared. There is no
 * fast-Latin path in this overload.
 *
 * @param left      iterator over the first text; its position is modified
 * @param right     iterator over the second text; its position is modified
 * @param errorCode in/out error code
 * @return the comparison result
 */
UCollationResult
RuleBasedCollator::compare(UCharIterator &left, UCharIterator &right,
                           UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode) || &left == &right) { return UCOL_EQUAL; }
    UBool numeric = settings->isNumeric();

    // Identical-prefix test.
    int32_t equalPrefixLength = 0;
    {
        UChar32 leftUnit;
        UChar32 rightUnit;
        while((leftUnit = left.next(&left)) == (rightUnit = right.next(&right))) {
            if(leftUnit < 0) { return UCOL_EQUAL; }
            ++equalPrefixLength;
        }

        // Back out the code units that differed, for the real collation comparison.
        if(leftUnit >= 0) { left.previous(&left); }
        if(rightUnit >= 0) { right.previous(&right); }

        if(equalPrefixLength > 0) {
            if((leftUnit >= 0 && data->isUnsafeBackward(leftUnit, numeric)) ||
                    (rightUnit >= 0 && data->isUnsafeBackward(rightUnit, numeric))) {
                // Identical prefix: Back up to the start of a contraction or reordering sequence.
                // Both iterators step back together; only the left unit is tested.
                do {
                    --equalPrefixLength;
                    leftUnit = left.previous(&left);
                    right.previous(&right);
                } while(equalPrefixLength > 0 && data->isUnsafeBackward(leftUnit, numeric));
            }
            // See the notes in the UTF-16 version.
        }
    }

    UCollationResult result;
    if(settings->dontCheckFCD()) {
        UIterCollationIterator leftIter(data, numeric, left);
        UIterCollationIterator rightIter(data, numeric, right);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    } else {
        FCDUIterCollationIterator leftIter(data, numeric, left, equalPrefixLength);
        FCDUIterCollationIterator rightIter(data, numeric, right, equalPrefixLength);
        result = CollationCompare::compareUpToQuaternary(leftIter, rightIter, *settings, errorCode);
    }
    if(result != UCOL_EQUAL || settings->getStrength() < UCOL_IDENTICAL || U_FAILURE(errorCode)) {
        return result;
    }

    // Compare identical level.
    // Reposition both iterators at the end of the equal prefix first.
    left.move(&left, equalPrefixLength, UITER_ZERO);
    right.move(&right, equalPrefixLength, UITER_ZERO);
    const Normalizer2Impl &nfcImpl = data->nfcImpl;
    if(settings->dontCheckFCD()) {
        UIterNFDIterator leftIter(left);
        UIterNFDIterator rightIter(right);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    } else {
        FCDUIterNFDIterator leftIter(data, left, equalPrefixLength);
        FCDUIterNFDIterator rightIter(data, right, equalPrefixLength);
        return compareNFDIter(nfcImpl, leftIter, rightIter);
    }
}

/**
 * Computes the collation key for a UnicodeString by forwarding its buffer and
 * length to the (const UChar *, int32_t) overload.
 */
CollationKey &
RuleBasedCollator::getCollationKey(const UnicodeString &s, CollationKey &key,
                                   UErrorCode &errorCode) const {
    return getCollationKey(s.getBuffer(), s.length(), key, errorCode);
}

/**
 * Computes the collation key for a UTF-16 string and stores it in key.
 *
 * @param s         the string; NULL is accepted only together with length 0
 * @param length    passed through to writeSortKey(), where a negative value means
 *                  that no limit pointer is used
 * @param key       receives the sort key; set to bogus on any failure
 * @param errorCode in/out error code. Set to U_ILLEGAL_ARGUMENT_ERROR for a NULL s
 *                  with non-zero length, and to U_MEMORY_ALLOCATION_ERROR if the key
 *                  is bogus after writing without another error having been reported.
 * @return key
 */
CollationKey &
RuleBasedCollator::getCollationKey(const UChar *s, int32_t length, CollationKey& key,
                                   UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) {
        return key.setToBogus();
    }
    if(s == NULL && length != 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return key.setToBogus();
    }
    key.reset();  // resets the "bogus" state
    CollationKeyByteSink sink(key);
    writeSortKey(s, length, sink, errorCode);
    if(U_FAILURE(errorCode)) {
        key.setToBogus();
    } else if(key.isBogus()) {
        errorCode = U_MEMORY_ALLOCATION_ERROR;
    } else {
        key.setLength(sink.NumberOfBytesAppended());
    }
    return key;
}

/**
 * Writes the sort key for a UnicodeString into dest by forwarding its buffer and
 * length to the (const UChar *, int32_t, ...) overload.
 */
int32_t
RuleBasedCollator::getSortKey(const UnicodeString &s,
                              uint8_t *dest, int32_t capacity) const {
    return getSortKey(s.getBuffer(), s.length(), dest, capacity);
}

/**
 * Writes the sort key for a UTF-16 string into a caller-provided buffer.
 *
 * This function has no error-code parameter: invalid arguments and internal
 * failures both yield a return value of 0.
 *
 * @param s        the string; NULL is accepted only together with length 0
 * @param length   passed through to writeSortKey()
 * @param dest     output buffer; NULL is accepted only together with capacity 0
 * @param capacity size of dest in bytes; must not be negative
 * @return the number of bytes the sink reports as appended (which is not limited
 *         to capacity), or 0 on invalid arguments or failure
 */
int32_t
RuleBasedCollator::getSortKey(const UChar *s, int32_t length,
                              uint8_t *dest, int32_t capacity) const {
    if((s == NULL && length != 0) || capacity < 0 || (dest == NULL && capacity > 0)) {
        return 0;
    }
    uint8_t noDest[1] = { 0 };
    if(dest == NULL) {
        // Distinguish pure preflighting from an allocation error.
        dest = noDest;
        capacity = 0;
    }
    FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), capacity);
    UErrorCode errorCode = U_ZERO_ERROR;
    writeSortKey(s, length, sink, errorCode);
    return U_SUCCESS(errorCode) ? sink.NumberOfBytesAppended() : 0;
}

/**
 * Writes the complete sort key for a UTF-16 string into sink:
 * the levels up to quaternary, then the identical level if the strength is
 * UCOL_IDENTICAL, then a single 0 terminator byte.
 *
 * @param s         the string
 * @param length    number of code units; if negative, the iterators get a NULL limit
 * @param sink      receives the sort key bytes
 * @param errorCode in/out error code; nothing is written if it already indicates failure
 */
void
RuleBasedCollator::writeSortKey(const UChar *s, int32_t length,
                                SortKeyByteSink &sink, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    const UChar *limit = (length >= 0) ? s + length : NULL;
    UBool numeric = settings->isNumeric();
    CollationKeys::LevelCallback callback;
    if(settings->dontCheckFCD()) {
        UTF16CollationIterator iter(data, numeric, s, s, limit);
        CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
                                                  sink, Collation::PRIMARY_LEVEL,
                                                  callback, TRUE, errorCode);
    } else {
        FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
        CollationKeys::writeSortKeyUpToQuaternary(iter, data->compressibleBytes, *settings,
                                                  sink, Collation::PRIMARY_LEVEL,
                                                  callback, TRUE, errorCode);
    }
    if(settings->getStrength() == UCOL_IDENTICAL) {
        writeIdenticalLevel(s, limit, sink, errorCode);
    }
    // The terminator is appended unconditionally, without re-checking errorCode.
    static const char terminator = 0;  // TERMINATOR_BYTE
    sink.Append(&terminator, 1);
}

/**
 * Appends a level separator and the identical-level bytes for [s, limit) to sink.
 *
 * The leading part of the text that passes the NFD quick check is written
 * directly; any remainder is decomposed into a temporary UnicodeString first.
 *
 * @param s         start of the text
 * @param limit     end of the text, or NULL if s is NUL-terminated
 * @param sink      receives the bytes
 * @param errorCode in/out error code; checked after the quick-check call only
 */
void
RuleBasedCollator::writeIdenticalLevel(const UChar *s, const UChar *limit,
                                       SortKeyByteSink &sink, UErrorCode &errorCode) const {
    // NFD quick check
    const UChar *nfdQCYesLimit = data->nfcImpl.decompose(s, limit, NULL, errorCode);
    if(U_FAILURE(errorCode)) { return; }
    sink.Append(Collation::LEVEL_SEPARATOR_BYTE);
    // prev carries the value returned by the first run into the second run.
    UChar32 prev = 0;
    if(nfdQCYesLimit != s) {
        prev = u_writeIdenticalLevelRun(prev, s, (int32_t)(nfdQCYesLimit - s), sink);
    }
    // Is there non-NFD text?
    int32_t destLengthEstimate;
    if(limit != NULL) {
        if(nfdQCYesLimit == limit) { return; }
        destLengthEstimate = (int32_t)(limit - nfdQCYesLimit);
    } else {
        // s is NUL-terminated
        if(*nfdQCYesLimit == 0) { return; }
        destLengthEstimate = -1;
    }
    UnicodeString nfd;
    data->nfcImpl.decompose(nfdQCYesLimit, limit, nfd, destLengthEstimate, errorCode);
    u_writeIdenticalLevelRun(prev, nfd.getBuffer(), nfd.length(), sink);
}

namespace {

/**
 * internalNextSortKeyPart() calls CollationKeys::writeSortKeyUpToQuaternary()
 * with an instance of this callback class.
 * When another level is about to be written, the callback
 * records the level and the number of bytes that will be written until
 * the sink (which is actually a FixedSortKeyByteSink) fills up.
 *
 * When internalNextSortKeyPart() is called again, it restarts with the last level
 * and ignores as many bytes as were written previously for that level.
 */
class PartLevelCallback : public CollationKeys::LevelCallback {
public:
    PartLevelCallback(const SortKeyByteSink &s)
            : sink(s), level(Collation::PRIMARY_LEVEL) {
        levelCapacity = sink.GetRemainingCapacity();
    }
    virtual ~PartLevelCallback() {}
    // Returns FALSE once the sink has overflowed; until then it records the level
    // and the sink's remaining capacity at the time of the call.
    virtual UBool needToWrite(Collation::Level l) {
        if(!sink.Overflowed()) {
            // Remember a level that will be at least partially written.
            level = l;
            levelCapacity = sink.GetRemainingCapacity();
            return TRUE;
        } else {
            return FALSE;
        }
    }
    Collation::Level getLevel() const { return level; }
    int32_t getLevelCapacity() const { return levelCapacity; }

private:
    const SortKeyByteSink &sink;
    Collation::Level level;
    int32_t levelCapacity;
};

}  // namespace

/**
 * Writes the next part (up to count bytes) of a sort key for the text of iter.
 *
 * Each call rewinds iter to its start. state[0] is read as the level to resume
 * with and state[1] as the number of bytes the sink is told to ignore; both are
 * updated before returning.
 *
 * @param iter      text iterator; must not be NULL
 * @param state     two-element resume state; must not be NULL
 * @param dest      output buffer; must not be NULL if count > 0
 * @param count     capacity of dest; must not be negative
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for bad arguments
 * @return count if the sink received more than count bytes (more parts may follow);
 *         otherwise the number of bytes written, with the rest of dest zero-filled
 *         and state set to ZERO_LEVEL; 0 on error or if count is 0
 */
int32_t
RuleBasedCollator::internalNextSortKeyPart(UCharIterator *iter, uint32_t state[2],
                                           uint8_t *dest, int32_t count, UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(iter == NULL || state == NULL || count < 0 || (count > 0 && dest == NULL)) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(count == 0) { return 0; }

    FixedSortKeyByteSink sink(reinterpret_cast<char *>(dest), count);
    sink.IgnoreBytes((int32_t)state[1]);
    iter->move(iter, 0, UITER_START);

    Collation::Level level = (Collation::Level)state[0];
    if(level <= Collation::QUATERNARY_LEVEL) {
        UBool numeric = settings->isNumeric();
        PartLevelCallback callback(sink);
        if(settings->dontCheckFCD()) {
            UIterCollationIterator ci(data, numeric, *iter);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        } else {
            FCDUIterCollationIterator ci(data, numeric, *iter, 0);
            CollationKeys::writeSortKeyUpToQuaternary(ci, data->compressibleBytes, *settings,
                                                      sink, level, callback, FALSE, errorCode);
        }
        if(U_FAILURE(errorCode)) { return 0; }
        if(sink.NumberOfBytesAppended() > count) {
            // dest is full: save what the callback recorded for the last level started.
            state[0] = (uint32_t)callback.getLevel();
            state[1] = (uint32_t)callback.getLevelCapacity();
            return count;
        }
        // All of the normal levels are done.
        if(settings->getStrength() == UCOL_IDENTICAL) {
            level = Collation::IDENTICAL_LEVEL;
            iter->move(iter, 0, UITER_START);
        }
        // else fall through to setting ZERO_LEVEL
    }

    if(level == Collation::IDENTICAL_LEVEL) {
        int32_t levelCapacity = sink.GetRemainingCapacity();
        // Copy the whole text out of the iterator, because writeIdenticalLevel()
        // takes a UChar array.
        UnicodeString s;
        for(;;) {
            UChar32 c = iter->next(iter);
            if(c < 0) { break; }
            s.append((UChar)c);
        }
        const UChar *sArray = s.getBuffer();
        writeIdenticalLevel(sArray, sArray + s.length(), sink, errorCode);
        if(U_FAILURE(errorCode)) { return 0; }
        if(sink.NumberOfBytesAppended() > count) {
            state[0] = (uint32_t)level;
            state[1] = (uint32_t)levelCapacity;
            return count;
        }
    }

    // ZERO_LEVEL: Fill the remainder of dest with 00 bytes.
    state[0] = (uint32_t)Collation::ZERO_LEVEL;
    state[1] = 0;
    int32_t length = sink.NumberOfBytesAppended();
    int32_t i = length;
    while(i < count) { dest[i++] = 0; }
    return length;
}

/**
 * Appends all collation elements of str to ces, in iteration order.
 *
 * @param str       the text
 * @param ces       receives every value returned by nextCE() before Collation::NO_CE
 * @param errorCode in/out error code; nothing is done if it already indicates failure
 */
void
RuleBasedCollator::internalGetCEs(const UnicodeString &str, UVector64 &ces,
                                  UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return; }
    const UChar *s = str.getBuffer();
    const UChar *limit = s + str.length();
    UBool numeric = settings->isNumeric();
    if(settings->dontCheckFCD()) {
        UTF16CollationIterator iter(data, numeric, s, s, limit);
        int64_t ce;
        while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
            ces.addElement(ce, errorCode);
        }
    } else {
        FCDUTF16CollationIterator iter(data, numeric, s, s, limit);
        int64_t ce;
        while((ce = iter.nextCE(errorCode)) != Collation::NO_CE) {
            ces.addElement(ce, errorCode);
        }
    }
}

namespace {

/**
 * Appends "<letter><SUBTAG>" to s, with the subtag converted to uppercase and
 * preceded by '_' if s is not empty. Does nothing on failure or if length is 0.
 */
void appendSubtag(CharString &s, char letter, const char *subtag, int32_t length,
                  UErrorCode &errorCode) {
    if(U_FAILURE(errorCode) || length == 0) { return; }
    if(!s.isEmpty()) {
        s.append('_', errorCode);
    }
    s.append(letter, errorCode);
    for(int32_t i = 0; i < length; ++i) {
        s.append(uprv_toupper(subtag[i]), errorCode);
    }
}

/**
 * Appends "<letter><c>" to s, preceded by '_' if s is not empty, where c is the
 * character found in the valueChars table at index value.
 * Does nothing if errorCode already indicates failure.
 */
void appendAttribute(CharString &s, char letter, UColAttributeValue value,
                     UErrorCode &errorCode) {
    if(U_FAILURE(errorCode)) { return; }
    if(!s.isEmpty()) {
        s.append('_', errorCode);
    }
    // value is used directly as the index; there is no range check here.
    static const char *valueChars = "1234...........IXO..SN..LU......";
    s.append(letter, errorCode);
    s.append(valueChars[value], errorCode);
}

}  // namespace

/**
 * Builds the short definition string for this collator and copies it to buffer.
 *
 * The string combines the attributes that have been set explicitly with subtags
 * taken from the functionally equivalent collation locale of locale.
 *
 * @param locale    locale ID; if NULL, the collator's ULOC_VALID_LOCALE is used
 * @param buffer    output buffer; may be NULL only if capacity is 0
 * @param capacity  size of buffer; must not be negative
 * @param errorCode in/out error code; set to U_ILLEGAL_ARGUMENT_ERROR for bad arguments
 * @return the value returned by u_terminateChars() for the full string length,
 *         or 0 on failure
 */
int32_t
RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
                                                    char *buffer, int32_t capacity,
                                                    UErrorCode &errorCode) const {
    if(U_FAILURE(errorCode)) { return 0; }
    if(buffer == NULL ? capacity != 0 : capacity < 0) {
        errorCode = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }
    if(locale == NULL) {
        locale = internalGetLocaleID(ULOC_VALID_LOCALE, errorCode);
    }

    // One extra byte so that resultLocale[length] below is in bounds even if
    // length equals the capacity passed to ucol_getFunctionalEquivalent().
    char resultLocale[ULOC_FULLNAME_CAPACITY + 1];
    int32_t length = ucol_getFunctionalEquivalent(resultLocale, ULOC_FULLNAME_CAPACITY,
                                                  "collation", locale,
                                                  NULL, &errorCode);
    if(U_FAILURE(errorCode)) { return 0; }
    if(length == 0) {
        uprv_strcpy(resultLocale, "root");
    } else {
        resultLocale[length] = 0;
    }

    // Append items in alphabetic order of their short definition letters.
    CharString result;
    char subtag[ULOC_KEYWORD_AND_VALUES_CAPACITY];

    if(attributeHasBeenSetExplicitly(UCOL_ALTERNATE_HANDLING)) {
        appendAttribute(result, 'A', getAttribute(UCOL_ALTERNATE_HANDLING, errorCode), errorCode);
    }
    // ATTR_VARIABLE_TOP not supported because 'B' was broken.
    // See ICU tickets #10372 and #10386.
    if(attributeHasBeenSetExplicitly(UCOL_CASE_FIRST)) {
        appendAttribute(result, 'C', getAttribute(UCOL_CASE_FIRST, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_NUMERIC_COLLATION)) {
        appendAttribute(result, 'D', getAttribute(UCOL_NUMERIC_COLLATION, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_CASE_LEVEL)) {
        appendAttribute(result, 'E', getAttribute(UCOL_CASE_LEVEL, errorCode), errorCode);
    }
    if(attributeHasBeenSetExplicitly(UCOL_FRENCH_COLLATION)) {
        appendAttribute(result, 'F', getAttribute(UCOL_FRENCH_COLLATION, errorCode), errorCode);
    }
    // Note: UCOL_HIRAGANA_QUATERNARY_MODE is deprecated and never changes away from default.
    length = uloc_getKeywordValue(resultLocale, "collation", subtag, UPRV_LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'K', subtag, length, errorCode);
    length = uloc_getLanguage(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'L', subtag, length, errorCode);
    if(attributeHasBeenSetExplicitly(UCOL_NORMALIZATION_MODE)) {
        appendAttribute(result, 'N', getAttribute(UCOL_NORMALIZATION_MODE, errorCode), errorCode);
    }
    length = uloc_getCountry(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'R', subtag, length, errorCode);
    if(attributeHasBeenSetExplicitly(UCOL_STRENGTH)) {
        appendAttribute(result, 'S', getAttribute(UCOL_STRENGTH, errorCode), errorCode);
    }
    length = uloc_getVariant(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'V', subtag, length, errorCode);
    length = uloc_getScript(resultLocale, subtag, UPRV_LENGTHOF(subtag), &errorCode);
    appendSubtag(result, 'Z', subtag, length, errorCode);

    if(U_FAILURE(errorCode)) { return 0; }
    // Copy only if the whole string fits; u_terminateChars() is called either way.
    if(result.length() <= capacity) {
        uprv_memcpy(buffer, result.data(), result.length());
    }
    return u_terminateChars(buffer, capacity, result.length(), &errorCode);
}

/**
 * Returns data->isUnsafeBackward() for c, using this collator's numeric setting.
 */
UBool
RuleBasedCollator::isUnsafe(UChar32 c) const {
    return data->isUnsafeBackward(c, settings->isNumeric());
}

/**
 * Init-once callback: computes the max-expansions table for the tailoring's data
 * and stores it in t->maxExpansions.
 */
void
RuleBasedCollator::computeMaxExpansions(const CollationTailoring *t, UErrorCode &errorCode) {
    t->maxExpansions = CollationElementIterator::computeMaxExpansions(t->data, errorCode);
}

/**
 * Runs computeMaxExpansions() for this collator's tailoring through
 * umtx_initOnce().
 *
 * @return TRUE if errorCode indicates success afterwards
 */
UBool
RuleBasedCollator::initMaxExpansions(UErrorCode &errorCode) const {
    umtx_initOnce(tailoring->maxExpansionsInitOnce, computeMaxExpansions, tailoring, errorCode);
    return U_SUCCESS(errorCode);
}

/**
 * Creates a CollationElementIterator over a UnicodeString.
 *
 * @return the new iterator, or NULL if initializing the max expansions or
 *         constructing the iterator reported an error
 */
CollationElementIterator *
RuleBasedCollator::createCollationElementIterator(const UnicodeString& source) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    if(!initMaxExpansions(errorCode)) { return NULL; }
    CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
    if(U_FAILURE(errorCode)) {
        delete cei;
        return NULL;
    }
    return cei;
}

/**
 * Creates a CollationElementIterator over a CharacterIterator.
 *
 * @return the new iterator, or NULL if initializing the max expansions or
 *         constructing the iterator reported an error
 */
CollationElementIterator *
RuleBasedCollator::createCollationElementIterator(const CharacterIterator& source) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    if(!initMaxExpansions(errorCode)) { return NULL; }
    CollationElementIterator *cei = new CollationElementIterator(source, this, errorCode);
    if(U_FAILURE(errorCode)) {
        delete cei;
        return NULL;
    }
    return cei;
}

/**
 * Looks up the maximum expansion for a collation order value in the tailoring's
 * max-expansions table.
 *
 * The result of the initialization is deliberately ignored (cast to void);
 * the lookup is performed with whatever tailoring->maxExpansions holds afterwards.
 */
int32_t
RuleBasedCollator::getMaxExpansion(int32_t order) const {
    UErrorCode errorCode = U_ZERO_ERROR;
    (void)initMaxExpansions(errorCode);
    return CollationElementIterator::getMaxExpansion(tailoring->maxExpansions, order);
}

U_NAMESPACE_END

#endif  // !UCONFIG_NO_COLLATION