/*
******************************************************************************
* Copyright (C) 1996-2013, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/

/**
* File tblcoll.cpp
*
* Created by: Helena Shih
*
* Modification History:
*
* Date         Name        Description
* 2/5/97       aliu        Added streamIn and streamOut methods. Added
*                          constructor which reads RuleBasedCollator object from
*                          a binary file. Added writeToFile method which streams
*                          RuleBasedCollator out to a binary file. The streamIn
*                          and streamOut methods use istream and ostream objects
*                          in binary mode.
* 2/11/97      aliu        Moved declarations out of for loop initializer.
*                          Added Mac compatibility #ifdef for ios::nocreate.
* 2/12/97      aliu        Modified to use TableCollationData sub-object to
*                          hold invariant data.
* 2/13/97      aliu        Moved several methods into this class from Collation.
*                          Added a private RuleBasedCollator(Locale&) constructor,
*                          to be used by Collator::getInstance(). General
*                          clean up. Made use of UErrorCode variables consistent.
* 2/20/97      helena      Added clone, operator==, operator!=, operator=, and copy
*                          constructor and getDynamicClassID.
* 3/5/97       aliu        Changed compaction cycle to improve performance. We
*                          use the maximum allowable value which is kBlockCount.
*                          Modified getRules() to load rules dynamically. Changed
*                          constructFromFile() call to accommodate this (added
*                          parameter to specify whether binary loading is to
*                          take place).
* 05/06/97     helena      Added memory allocation error check.
* 6/20/97      helena      Java class name change.
* 6/23/97      helena      Adding comments to make code more readable.
* 09/03/97     helena      Added createCollationKeyValues().
* 06/26/98     erm         Changes for CollationKeys using byte arrays.
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java 44 * 04/23/99 stephen Removed EDecompositionMode, merged with 45 * Normalizer::EMode 46 * 06/14/99 stephen Removed kResourceBundleSuffix 47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx 48 * files are no longer used. 49 * 11/02/99 helena Collator performance enhancements. Special case 50 * for NO_OP situations. 51 * 11/17/99 srl More performance enhancements. Inlined some internal functions. 52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 53 * to implementation file. 54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) 55 */ 56 57 #include "unicode/utypes.h" 58 59 #if !UCONFIG_NO_COLLATION 60 61 #include "unicode/tblcoll.h" 62 #include "unicode/coleitr.h" 63 #include "unicode/ures.h" 64 #include "unicode/uset.h" 65 #include "ucol_imp.h" 66 #include "uresimp.h" 67 #include "uhash.h" 68 #include "cmemory.h" 69 #include "cstring.h" 70 #include "putilimp.h" 71 #include "ustr_imp.h" 72 73 /* public RuleBasedCollator constructor ---------------------------------- */ 74 75 U_NAMESPACE_BEGIN 76 77 /** 78 * Copy constructor, aliasing, not write-through 79 */ 80 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) 81 : Collator(that) 82 , dataIsOwned(FALSE) 83 , isWriteThroughAlias(FALSE) 84 , ucollator(NULL) 85 { 86 RuleBasedCollator::operator=(that); 87 } 88 89 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 90 UErrorCode& status) : 91 dataIsOwned(FALSE) 92 { 93 construct(rules, 94 UCOL_DEFAULT_STRENGTH, 95 UCOL_DEFAULT, 96 status); 97 } 98 99 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 100 ECollationStrength collationStrength, 101 UErrorCode& status) : dataIsOwned(FALSE) 102 { 103 construct(rules, 104 (UColAttributeValue)collationStrength, 105 UCOL_DEFAULT, 106 status); 107 } 108 109 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 110 UColAttributeValue 
decompositionMode, 111 UErrorCode& status) : 112 dataIsOwned(FALSE) 113 { 114 construct(rules, 115 UCOL_DEFAULT_STRENGTH, 116 decompositionMode, 117 status); 118 } 119 120 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 121 ECollationStrength collationStrength, 122 UColAttributeValue decompositionMode, 123 UErrorCode& status) : dataIsOwned(FALSE) 124 { 125 construct(rules, 126 (UColAttributeValue)collationStrength, 127 decompositionMode, 128 status); 129 } 130 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 131 const RuleBasedCollator *base, 132 UErrorCode &status) : 133 dataIsOwned(TRUE), 134 isWriteThroughAlias(FALSE) 135 { 136 ucollator = ucol_openBinary(bin, length, base->ucollator, &status); 137 } 138 139 void 140 RuleBasedCollator::setRuleStringFromCollator() 141 { 142 int32_t length; 143 const UChar *r = ucol_getRules(ucollator, &length); 144 145 if (r && length > 0) { 146 // alias the rules string 147 urulestring.setTo(TRUE, r, length); 148 } 149 else { 150 urulestring.truncate(0); // Clear string. 
151 } 152 } 153 154 // not aliasing, not write-through 155 void 156 RuleBasedCollator::construct(const UnicodeString& rules, 157 UColAttributeValue collationStrength, 158 UColAttributeValue decompositionMode, 159 UErrorCode& status) 160 { 161 ucollator = ucol_openRules(rules.getBuffer(), rules.length(), 162 decompositionMode, collationStrength, 163 NULL, &status); 164 165 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it 166 isWriteThroughAlias = FALSE; 167 168 if(ucollator == NULL) { 169 if(U_SUCCESS(status)) { 170 status = U_MEMORY_ALLOCATION_ERROR; 171 } 172 return; // Failure 173 } 174 175 setRuleStringFromCollator(); 176 } 177 178 /* RuleBasedCollator public destructor ----------------------------------- */ 179 180 RuleBasedCollator::~RuleBasedCollator() 181 { 182 if (dataIsOwned) 183 { 184 ucol_close(ucollator); 185 } 186 ucollator = 0; 187 } 188 189 /* RuleBaseCollator public methods --------------------------------------- */ 190 191 UBool RuleBasedCollator::operator==(const Collator& that) const 192 { 193 /* only checks for address equals here */ 194 if (this == &that) { 195 return TRUE; 196 } 197 if (!Collator::operator==(that)) { 198 return FALSE; /* not the same class */ 199 } 200 201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; 202 203 return ucol_equals(this->ucollator, thatAlias.ucollator); 204 } 205 206 // aliasing, not write-through 207 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) 208 { 209 if (this == &that) { return *this; } 210 211 UErrorCode intStatus = U_ZERO_ERROR; 212 UCollator *ucol = ucol_safeClone(that.ucollator, NULL, NULL, &intStatus); 213 if (U_FAILURE(intStatus)) { return *this; } 214 215 if (dataIsOwned) { 216 ucol_close(ucollator); 217 } 218 ucollator = ucol; 219 dataIsOwned = TRUE; 220 isWriteThroughAlias = FALSE; 221 setRuleStringFromCollator(); 222 return *this; 223 } 224 225 // aliasing, not write-through 226 Collator* RuleBasedCollator::clone() const 227 { 
228 RuleBasedCollator* coll = new RuleBasedCollator(*this); 229 // There is a small chance that the internal ucol_safeClone() call fails. 230 if (coll != NULL && coll->ucollator == NULL) { 231 delete coll; 232 return NULL; 233 } 234 return coll; 235 } 236 237 238 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 239 (const UnicodeString& source) const 240 { 241 UErrorCode status = U_ZERO_ERROR; 242 CollationElementIterator *result = new CollationElementIterator(source, this, 243 status); 244 if (U_FAILURE(status)) { 245 delete result; 246 return NULL; 247 } 248 249 return result; 250 } 251 252 /** 253 * Create a CollationElementIterator object that will iterate over the 254 * elements in a string, using the collation rules defined in this 255 * RuleBasedCollator 256 */ 257 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 258 (const CharacterIterator& source) const 259 { 260 UErrorCode status = U_ZERO_ERROR; 261 CollationElementIterator *result = new CollationElementIterator(source, this, 262 status); 263 264 if (U_FAILURE(status)) { 265 delete result; 266 return NULL; 267 } 268 269 return result; 270 } 271 272 /** 273 * Return a string representation of this collator's rules. The string can 274 * later be passed to the constructor that takes a UnicodeString argument, 275 * which will construct a collator that's functionally identical to this one. 276 * You can also allow users to edit the string in order to change the collation 277 * data, or you can print it out for inspection, or whatever. 
278 */ 279 const UnicodeString& RuleBasedCollator::getRules() const 280 { 281 return urulestring; 282 } 283 284 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) 285 { 286 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); 287 288 if (rulesize > 0) { 289 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); 290 if(rules != NULL) { 291 ucol_getRulesEx(ucollator, delta, rules, rulesize); 292 buffer.setTo(rules, rulesize); 293 uprv_free(rules); 294 } else { // couldn't allocate 295 buffer.remove(); 296 } 297 } 298 else { 299 buffer.remove(); 300 } 301 } 302 303 UnicodeSet * 304 RuleBasedCollator::getTailoredSet(UErrorCode &status) const 305 { 306 if(U_FAILURE(status)) { 307 return NULL; 308 } 309 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); 310 } 311 312 313 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const 314 { 315 if (versionInfo!=NULL){ 316 ucol_getVersion(ucollator, versionInfo); 317 } 318 } 319 320 /** 321 * Compare two strings using this collator 322 */ 323 UCollationResult RuleBasedCollator::compare( 324 const UnicodeString& source, 325 const UnicodeString& target, 326 int32_t length, 327 UErrorCode &status) const 328 { 329 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); 330 } 331 332 UCollationResult RuleBasedCollator::compare(const UChar* source, 333 int32_t sourceLength, 334 const UChar* target, 335 int32_t targetLength, 336 UErrorCode &status) const 337 { 338 if(U_SUCCESS(status)) { 339 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); 340 } else { 341 return UCOL_EQUAL; 342 } 343 } 344 345 UCollationResult RuleBasedCollator::compare( 346 const UnicodeString& source, 347 const UnicodeString& target, 348 UErrorCode &status) const 349 { 350 if(U_SUCCESS(status)) { 351 return ucol_strcoll(ucollator, source.getBuffer(), source.length(), 352 target.getBuffer(), 
target.length()); 353 } else { 354 return UCOL_EQUAL; 355 } 356 } 357 358 UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, 359 UCharIterator &tIter, 360 UErrorCode &status) const { 361 if(U_SUCCESS(status)) { 362 return ucol_strcollIter(ucollator, &sIter, &tIter, &status); 363 } else { 364 return UCOL_EQUAL; 365 } 366 } 367 368 /** 369 * Retrieve a collation key for the specified string. The key can be compared 370 * with other collation keys using a bitwise comparison (e.g. memcmp) to find 371 * the ordering of their respective source strings. This is handy when doing a 372 * sort, where each sort key must be compared many times. 373 * 374 * The basic algorithm here is to find all of the collation elements for each 375 * character in the source string, convert them to an ASCII representation, and 376 * put them into the collation key. But it's trickier than that. Each 377 * collation element in a string has three components: primary ('A' vs 'B'), 378 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference 379 * at the end of a string takes precedence over a secondary or tertiary 380 * difference earlier in the string. 381 * 382 * To account for this, we put all of the primary orders at the beginning of 383 * the string, followed by the secondary and tertiary orders. Each set of 384 * orders is terminated by nulls so that a key for a string which is a initial 385 * substring of another key will compare less without any special case. 386 * 387 * Here's a hypothetical example, with the collation element represented as a 388 * three-digit number, one digit for primary, one for secondary, etc. 389 * 390 * String: A a B \u00C9 391 * Collation Elements: 101 100 201 511 392 * Collation Key: 1125<null>0001<null>1011<null> 393 * 394 * To make things even trickier, secondary differences (accent marks) are 395 * compared starting at the *end* of the string in languages with French 396 * secondary ordering. 
But when comparing the accent marks on a single base 397 * character, they are compared from the beginning. To handle this, we reverse 398 * all of the accents that belong to each base character, then we reverse the 399 * entire string of secondary orderings at the end. 400 */ 401 CollationKey& RuleBasedCollator::getCollationKey( 402 const UnicodeString& source, 403 CollationKey& sortkey, 404 UErrorCode& status) const 405 { 406 return getCollationKey(source.getBuffer(), source.length(), sortkey, status); 407 } 408 409 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, 410 int32_t sourceLen, 411 CollationKey& sortkey, 412 UErrorCode& status) const 413 { 414 if (U_FAILURE(status)) { 415 return sortkey.setToBogus(); 416 } 417 if (sourceLen < -1 || (source == NULL && sourceLen != 0)) { 418 status = U_ILLEGAL_ARGUMENT_ERROR; 419 return sortkey.setToBogus(); 420 } 421 422 if (sourceLen < 0) { 423 sourceLen = u_strlen(source); 424 } 425 if (sourceLen == 0) { 426 return sortkey.reset(); 427 } 428 429 int32_t resultLen = ucol_getCollationKey(ucollator, source, sourceLen, sortkey, status); 430 431 if (U_SUCCESS(status)) { 432 sortkey.setLength(resultLen); 433 } else { 434 sortkey.setToBogus(); 435 } 436 return sortkey; 437 } 438 439 /** 440 * Return the maximum length of any expansion sequences that end with the 441 * specified comparison order. 442 * @param order a collation order returned by previous or next. 443 * @return the maximum length of any expansion seuences ending with the 444 * specified order or 1 if collation order does not occur at the end of any 445 * expansion sequence. 
446 * @see CollationElementIterator#getMaxExpansion 447 */ 448 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const 449 { 450 uint8_t result; 451 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); 452 return result; 453 } 454 455 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, 456 UErrorCode &status) 457 { 458 if (U_FAILURE(status)) { return NULL; } 459 LocalMemory<uint8_t> buffer((uint8_t *)uprv_malloc(20000)); 460 if (buffer.isNull()) { 461 status = U_MEMORY_ALLOCATION_ERROR; 462 return NULL; 463 } 464 length = cloneBinary(buffer.getAlias(), 20000, status); 465 if (status == U_BUFFER_OVERFLOW_ERROR) { 466 if (buffer.allocateInsteadAndCopy(length, 0) == NULL) { 467 status = U_MEMORY_ALLOCATION_ERROR; 468 return NULL; 469 } 470 status = U_ZERO_ERROR; 471 length = cloneBinary(buffer.getAlias(), length, status); 472 } 473 if (U_FAILURE(status)) { return NULL; } 474 return buffer.orphan(); 475 } 476 477 478 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) 479 { 480 return ucol_cloneBinary(ucollator, buffer, capacity, &status); 481 } 482 483 void RuleBasedCollator::setAttribute(UColAttribute attr, 484 UColAttributeValue value, 485 UErrorCode &status) 486 { 487 if (U_FAILURE(status)) 488 return; 489 checkOwned(); 490 ucol_setAttribute(ucollator, attr, value, &status); 491 } 492 493 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, 494 UErrorCode &status) const 495 { 496 if (U_FAILURE(status)) 497 return UCOL_DEFAULT; 498 return ucol_getAttribute(ucollator, attr, &status); 499 } 500 501 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { 502 checkOwned(); 503 return ucol_setVariableTop(ucollator, varTop, len, &status); 504 } 505 506 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &status) { 507 checkOwned(); 508 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); 
509 } 510 511 void RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &status) { 512 checkOwned(); 513 ucol_restoreVariableTop(ucollator, varTop, &status); 514 } 515 516 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { 517 return ucol_getVariableTop(ucollator, &status); 518 } 519 520 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, 521 uint8_t *result, int32_t resultLength) 522 const 523 { 524 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); 525 } 526 527 int32_t RuleBasedCollator::getSortKey(const UChar *source, 528 int32_t sourceLength, uint8_t *result, 529 int32_t resultLength) const 530 { 531 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); 532 } 533 534 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest, 535 int32_t destCapacity, 536 UErrorCode& status) const 537 { 538 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status); 539 } 540 541 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, 542 int32_t reorderCodesLength, 543 UErrorCode& status) 544 { 545 checkOwned(); 546 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status); 547 } 548 549 int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode, 550 int32_t* dest, 551 int32_t destCapacity, 552 UErrorCode& status) 553 { 554 return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status); 555 } 556 557 /** 558 * Create a hash code for this collation. Just hash the main rule table -- that 559 * should be good enough for almost any use. 
560 */ 561 int32_t RuleBasedCollator::hashCode() const 562 { 563 int32_t length; 564 const UChar *rules = ucol_getRules(ucollator, &length); 565 return ustr_hashUCharsN(rules, length); 566 } 567 568 /** 569 * return the locale of this collator 570 */ 571 Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { 572 const char *result = ucol_getLocaleByType(ucollator, type, &status); 573 if(result == NULL) { 574 Locale res(""); 575 res.setToBogus(); 576 return res; 577 } else { 578 return Locale(result); 579 } 580 } 581 582 void 583 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { 584 checkOwned(); 585 char* rloc = uprv_strdup(requestedLocale.getName()); 586 if (rloc) { 587 char* vloc = uprv_strdup(validLocale.getName()); 588 if (vloc) { 589 char* aloc = uprv_strdup(actualLocale.getName()); 590 if (aloc) { 591 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); 592 return; 593 } 594 uprv_free(vloc); 595 } 596 uprv_free(rloc); 597 } 598 } 599 600 // RuleBaseCollatorNew private constructor ---------------------------------- 601 602 RuleBasedCollator::RuleBasedCollator() 603 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 604 { 605 } 606 607 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, 608 UErrorCode& status) 609 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 610 { 611 if (U_FAILURE(status)) 612 return; 613 614 /* 615 Try to load, in order: 616 1. The desired locale's collation. 617 2. A fallback of the desired locale. 618 3. The default locale's collation. 619 4. A fallback of the default locale. 620 5. The default collation rules, which contains en_US collation rules. 
621 622 To reiterate, we try: 623 Specific: 624 language+country+variant 625 language+country 626 language 627 Default: 628 language+country+variant 629 language+country 630 language 631 Root: (aka DEFAULTRULES) 632 steps 1-5 are handled by resource bundle fallback mechanism. 633 however, in a very unprobable situation that no resource bundle 634 data exists, step 5 is repeated with hardcoded default rules. 635 */ 636 637 setUCollator(desiredLocale, status); 638 639 if (U_FAILURE(status)) 640 { 641 status = U_ZERO_ERROR; 642 643 setUCollator(kRootLocaleName, status); 644 if (status == U_ZERO_ERROR) { 645 status = U_USING_DEFAULT_WARNING; 646 } 647 } 648 649 if (U_SUCCESS(status)) 650 { 651 setRuleStringFromCollator(); 652 } 653 } 654 655 void 656 RuleBasedCollator::setUCollator(const char *locale, 657 UErrorCode &status) 658 { 659 if (U_FAILURE(status)) { 660 return; 661 } 662 if (ucollator && dataIsOwned) 663 ucol_close(ucollator); 664 ucollator = ucol_open_internal(locale, &status); 665 dataIsOwned = TRUE; 666 isWriteThroughAlias = FALSE; 667 } 668 669 670 void 671 RuleBasedCollator::checkOwned() { 672 if (!(dataIsOwned || isWriteThroughAlias)) { 673 UErrorCode status = U_ZERO_ERROR; 674 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); 675 setRuleStringFromCollator(); 676 dataIsOwned = TRUE; 677 isWriteThroughAlias = FALSE; 678 } 679 } 680 681 682 int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale, 683 char *buffer, 684 int32_t capacity, 685 UErrorCode &status) const { 686 /* simply delegate */ 687 return ucol_getShortDefinitionString(ucollator, locale, buffer, capacity, &status); 688 } 689 690 691 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 692 693 U_NAMESPACE_END 694 695 #endif /* #if !UCONFIG_NO_COLLATION */ 696