1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2010, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8 /** 9 * File tblcoll.cpp 10 * 11 * Created by: Helena Shih 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 2/5/97 aliu Added streamIn and streamOut methods. Added 17 * constructor which reads RuleBasedCollator object from 18 * a binary file. Added writeToFile method which streams 19 * RuleBasedCollator out to a binary file. The streamIn 20 * and streamOut methods use istream and ostream objects 21 * in binary mode. 22 * 2/11/97 aliu Moved declarations out of for loop initializer. 23 * Added Mac compatibility #ifdef for ios::nocreate. 24 * 2/12/97 aliu Modified to use TableCollationData sub-object to 25 * hold invariant data. 26 * 2/13/97 aliu Moved several methods into this class from Collation. 27 * Added a private RuleBasedCollator(Locale&) constructor, 28 * to be used by Collator::getInstance(). General 29 * clean up. Made use of UErrorCode variables consistent. 30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 31 * constructor and getDynamicClassID. 32 * 3/5/97 aliu Changed compaction cycle to improve performance. We 33 * use the maximum allowable value which is kBlockCount. 34 * Modified getRules() to load rules dynamically. Changed 35 * constructFromFile() call to accomodate this (added 36 * parameter to specify whether binary loading is to 37 * take place). 38 * 05/06/97 helena Added memory allocation error check. 39 * 6/20/97 helena Java class name change. 40 * 6/23/97 helena Adding comments to make code more readable. 41 * 09/03/97 helena Added createCollationKeyValues(). 42 * 06/26/98 erm Changes for CollationKeys using byte arrays. 
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java 44 * 04/23/99 stephen Removed EDecompositionMode, merged with 45 * Normalizer::EMode 46 * 06/14/99 stephen Removed kResourceBundleSuffix 47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx 48 * files are no longer used. 49 * 11/02/99 helena Collator performance enhancements. Special case 50 * for NO_OP situations. 51 * 11/17/99 srl More performance enhancements. Inlined some internal functions. 52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 53 * to implementation file. 54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) 55 */ 56 57 #include <typeinfo> // for 'typeid' to work 58 59 #include "unicode/utypes.h" 60 61 #if !UCONFIG_NO_COLLATION 62 63 #include "unicode/tblcoll.h" 64 #include "unicode/coleitr.h" 65 #include "unicode/ures.h" 66 #include "unicode/uset.h" 67 #include "ucol_imp.h" 68 #include "uresimp.h" 69 #include "uhash.h" 70 #include "cmemory.h" 71 #include "cstring.h" 72 #include "putilimp.h" 73 74 /* public RuleBasedCollator constructor ---------------------------------- */ 75 76 U_NAMESPACE_BEGIN 77 78 /** 79 * Copy constructor, aliasing, not write-through 80 */ 81 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) 82 : Collator(that) 83 , dataIsOwned(FALSE) 84 , isWriteThroughAlias(FALSE) 85 , ucollator(NULL) 86 { 87 RuleBasedCollator::operator=(that); 88 } 89 90 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 91 UErrorCode& status) : 92 dataIsOwned(FALSE) 93 { 94 construct(rules, 95 UCOL_DEFAULT_STRENGTH, 96 UCOL_DEFAULT, 97 status); 98 } 99 100 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 101 ECollationStrength collationStrength, 102 UErrorCode& status) : dataIsOwned(FALSE) 103 { 104 construct(rules, 105 getUCollationStrength(collationStrength), 106 UCOL_DEFAULT, 107 status); 108 } 109 110 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 111 
UColAttributeValue decompositionMode, 112 UErrorCode& status) : 113 dataIsOwned(FALSE) 114 { 115 construct(rules, 116 UCOL_DEFAULT_STRENGTH, 117 decompositionMode, 118 status); 119 } 120 121 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 122 ECollationStrength collationStrength, 123 UColAttributeValue decompositionMode, 124 UErrorCode& status) : dataIsOwned(FALSE) 125 { 126 construct(rules, 127 getUCollationStrength(collationStrength), 128 decompositionMode, 129 status); 130 } 131 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 132 const RuleBasedCollator *base, 133 UErrorCode &status) : 134 dataIsOwned(TRUE), 135 isWriteThroughAlias(FALSE) 136 { 137 ucollator = ucol_openBinary(bin, length, base->ucollator, &status); 138 } 139 140 void 141 RuleBasedCollator::setRuleStringFromCollator() 142 { 143 int32_t length; 144 const UChar *r = ucol_getRules(ucollator, &length); 145 146 if (r && length > 0) { 147 // alias the rules string 148 urulestring.setTo(TRUE, r, length); 149 } 150 else { 151 urulestring.truncate(0); // Clear string. 
152 } 153 } 154 155 // not aliasing, not write-through 156 void 157 RuleBasedCollator::construct(const UnicodeString& rules, 158 UColAttributeValue collationStrength, 159 UColAttributeValue decompositionMode, 160 UErrorCode& status) 161 { 162 ucollator = ucol_openRules(rules.getBuffer(), rules.length(), 163 decompositionMode, collationStrength, 164 NULL, &status); 165 166 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it 167 isWriteThroughAlias = FALSE; 168 169 if(ucollator == NULL) { 170 if(U_SUCCESS(status)) { 171 status = U_MEMORY_ALLOCATION_ERROR; 172 } 173 return; // Failure 174 } 175 176 setRuleStringFromCollator(); 177 } 178 179 /* RuleBasedCollator public destructor ----------------------------------- */ 180 181 RuleBasedCollator::~RuleBasedCollator() 182 { 183 if (dataIsOwned) 184 { 185 ucol_close(ucollator); 186 } 187 ucollator = 0; 188 } 189 190 /* RuleBaseCollator public methods --------------------------------------- */ 191 192 UBool RuleBasedCollator::operator==(const Collator& that) const 193 { 194 /* only checks for address equals here */ 195 if (Collator::operator==(that)) 196 return TRUE; 197 198 if (typeid(*this) != typeid(that)) 199 return FALSE; /* not the same class */ 200 201 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; 202 203 // weiv: use C function, commented code below is wrong 204 return ucol_equals(this->ucollator, thatAlias.ucollator); 205 /* 206 synwee : orginal code does not check for data compatibility 207 */ 208 /* 209 if (ucollator != thatAlias.ucollator) 210 return FALSE; 211 212 return TRUE; 213 */ 214 } 215 216 UBool RuleBasedCollator::operator!=(const Collator& other) const 217 { 218 return !(*this == other); 219 } 220 221 // aliasing, not write-through 222 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) 223 { 224 if (this != &that) 225 { 226 if (dataIsOwned) 227 { 228 ucol_close(ucollator); 229 } 230 231 urulestring.truncate(0); // empty the rule string 232 
dataIsOwned = TRUE; 233 isWriteThroughAlias = FALSE; 234 235 UErrorCode intStatus = U_ZERO_ERROR; 236 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 237 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize, 238 &intStatus); 239 if (U_SUCCESS(intStatus)) { 240 setRuleStringFromCollator(); 241 } 242 } 243 return *this; 244 } 245 246 // aliasing, not write-through 247 Collator* RuleBasedCollator::clone() const 248 { 249 return new RuleBasedCollator(*this); 250 } 251 252 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 253 (const UnicodeString& source) const 254 { 255 UErrorCode status = U_ZERO_ERROR; 256 CollationElementIterator *result = new CollationElementIterator(source, this, 257 status); 258 if (U_FAILURE(status)) { 259 delete result; 260 return NULL; 261 } 262 263 return result; 264 } 265 266 /** 267 * Create a CollationElementIterator object that will iterate over the 268 * elements in a string, using the collation rules defined in this 269 * RuleBasedCollator 270 */ 271 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 272 (const CharacterIterator& source) const 273 { 274 UErrorCode status = U_ZERO_ERROR; 275 CollationElementIterator *result = new CollationElementIterator(source, this, 276 status); 277 278 if (U_FAILURE(status)) { 279 delete result; 280 return NULL; 281 } 282 283 return result; 284 } 285 286 /** 287 * Return a string representation of this collator's rules. The string can 288 * later be passed to the constructor that takes a UnicodeString argument, 289 * which will construct a collator that's functionally identical to this one. 290 * You can also allow users to edit the string in order to change the collation 291 * data, or you can print it out for inspection, or whatever. 
292 */ 293 const UnicodeString& RuleBasedCollator::getRules() const 294 { 295 return urulestring; 296 } 297 298 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) 299 { 300 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); 301 302 if (rulesize > 0) { 303 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); 304 if(rules != NULL) { 305 ucol_getRulesEx(ucollator, delta, rules, rulesize); 306 buffer.setTo(rules, rulesize); 307 uprv_free(rules); 308 } else { // couldn't allocate 309 buffer.remove(); 310 } 311 } 312 else { 313 buffer.remove(); 314 } 315 } 316 317 UnicodeSet * 318 RuleBasedCollator::getTailoredSet(UErrorCode &status) const 319 { 320 if(U_FAILURE(status)) { 321 return NULL; 322 } 323 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); 324 } 325 326 327 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const 328 { 329 if (versionInfo!=NULL){ 330 ucol_getVersion(ucollator, versionInfo); 331 } 332 } 333 334 Collator::EComparisonResult RuleBasedCollator::compare( 335 const UnicodeString& source, 336 const UnicodeString& target, 337 int32_t length) const 338 { 339 UErrorCode status = U_ZERO_ERROR; 340 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status)); 341 } 342 343 UCollationResult RuleBasedCollator::compare( 344 const UnicodeString& source, 345 const UnicodeString& target, 346 int32_t length, 347 UErrorCode &status) const 348 { 349 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); 350 } 351 352 Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source, 353 int32_t sourceLength, 354 const UChar* target, 355 int32_t targetLength) 356 const 357 { 358 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength, 359 target, targetLength)); 360 } 361 362 UCollationResult 
RuleBasedCollator::compare(const UChar* source, 363 int32_t sourceLength, 364 const UChar* target, 365 int32_t targetLength, 366 UErrorCode &status) const 367 { 368 if(U_SUCCESS(status)) { 369 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); 370 } else { 371 return UCOL_EQUAL; 372 } 373 } 374 375 /** 376 * Compare two strings using this collator 377 */ 378 Collator::EComparisonResult RuleBasedCollator::compare( 379 const UnicodeString& source, 380 const UnicodeString& target) const 381 { 382 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(), 383 target.getBuffer(), target.length())); 384 } 385 386 UCollationResult RuleBasedCollator::compare( 387 const UnicodeString& source, 388 const UnicodeString& target, 389 UErrorCode &status) const 390 { 391 if(U_SUCCESS(status)) { 392 return ucol_strcoll(ucollator, source.getBuffer(), source.length(), 393 target.getBuffer(), target.length()); 394 } else { 395 return UCOL_EQUAL; 396 } 397 } 398 399 UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, 400 UCharIterator &tIter, 401 UErrorCode &status) const { 402 if(U_SUCCESS(status)) { 403 return ucol_strcollIter(ucollator, &sIter, &tIter, &status); 404 } else { 405 return UCOL_EQUAL; 406 } 407 } 408 409 /** 410 * Retrieve a collation key for the specified string. The key can be compared 411 * with other collation keys using a bitwise comparison (e.g. memcmp) to find 412 * the ordering of their respective source strings. This is handy when doing a 413 * sort, where each sort key must be compared many times. 414 * 415 * The basic algorithm here is to find all of the collation elements for each 416 * character in the source string, convert them to an ASCII representation, and 417 * put them into the collation key. But it's trickier than that. 
 * Each collation element in a string has three components: primary ('A' vs 'B'),
 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
 * at the end of a string takes precedence over a secondary or tertiary
 * difference earlier in the string.
 *
 * To account for this, we put all of the primary orders at the beginning of
 * the string, followed by the secondary and tertiary orders. Each set of
 * orders is terminated by nulls so that a key for a string which is an initial
 * substring of another key will compare less without any special case.
 *
 * Here's a hypothetical example, with the collation element represented as a
 * three-digit number, one digit for primary, one for secondary, etc.
 *
 * String:              A     a     B    \u00C9
 * Collation Elements: 101   100   201  511
 * Collation Key:      1125<null>0001<null>1011<null>
 *
 * To make things even trickier, secondary differences (accent marks) are
 * compared starting at the *end* of the string in languages with French
 * secondary ordering. But when comparing the accent marks on a single base
 * character, they are compared from the beginning. To handle this, we reverse
 * all of the accents that belong to each base character, then we reverse the
 * entire string of secondary orderings at the end.
441 */ 442 CollationKey& RuleBasedCollator::getCollationKey( 443 const UnicodeString& source, 444 CollationKey& sortkey, 445 UErrorCode& status) const 446 { 447 return getCollationKey(source.getBuffer(), source.length(), sortkey, status); 448 } 449 450 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, 451 int32_t sourceLen, 452 CollationKey& sortkey, 453 UErrorCode& status) const 454 { 455 if (U_FAILURE(status)) 456 { 457 return sortkey.setToBogus(); 458 } 459 460 if ((!source) || (sourceLen == 0)) { 461 return sortkey.reset(); 462 } 463 464 uint8_t *result; 465 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator, 466 source, sourceLen, 467 &result, 468 &status); 469 sortkey.adopt(result, resultLen); 470 return sortkey; 471 } 472 473 /** 474 * Return the maximum length of any expansion sequences that end with the 475 * specified comparison order. 476 * @param order a collation order returned by previous or next. 477 * @return the maximum length of any expansion seuences ending with the 478 * specified order or 1 if collation order does not occur at the end of any 479 * expansion sequence. 
480 * @see CollationElementIterator#getMaxExpansion 481 */ 482 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const 483 { 484 uint8_t result; 485 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); 486 return result; 487 } 488 489 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, 490 UErrorCode &status) 491 { 492 return ucol_cloneRuleData(ucollator, &length, &status); 493 } 494 495 496 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) 497 { 498 return ucol_cloneBinary(ucollator, buffer, capacity, &status); 499 } 500 501 void RuleBasedCollator::setAttribute(UColAttribute attr, 502 UColAttributeValue value, 503 UErrorCode &status) 504 { 505 if (U_FAILURE(status)) 506 return; 507 checkOwned(); 508 ucol_setAttribute(ucollator, attr, value, &status); 509 } 510 511 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, 512 UErrorCode &status) 513 { 514 if (U_FAILURE(status)) 515 return UCOL_DEFAULT; 516 return ucol_getAttribute(ucollator, attr, &status); 517 } 518 519 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { 520 checkOwned(); 521 return ucol_setVariableTop(ucollator, varTop, len, &status); 522 } 523 524 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) { 525 checkOwned(); 526 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); 527 } 528 529 void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) { 530 checkOwned(); 531 ucol_restoreVariableTop(ucollator, varTop, &status); 532 } 533 534 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { 535 return ucol_getVariableTop(ucollator, &status); 536 } 537 538 Collator* RuleBasedCollator::safeClone(void) 539 { 540 UErrorCode intStatus = U_ZERO_ERROR; 541 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 542 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize, 543 
&intStatus); 544 if (U_FAILURE(intStatus)) { 545 return NULL; 546 } 547 548 RuleBasedCollator *result = new RuleBasedCollator(); 549 // Null pointer check 550 if (result != NULL) { 551 result->ucollator = ucol; 552 result->dataIsOwned = TRUE; 553 result->isWriteThroughAlias = FALSE; 554 setRuleStringFromCollator(); 555 } 556 557 return result; 558 } 559 560 561 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, 562 uint8_t *result, int32_t resultLength) 563 const 564 { 565 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); 566 } 567 568 int32_t RuleBasedCollator::getSortKey(const UChar *source, 569 int32_t sourceLength, uint8_t *result, 570 int32_t resultLength) const 571 { 572 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); 573 } 574 575 Collator::ECollationStrength RuleBasedCollator::getStrength(void) const 576 { 577 UErrorCode intStatus = U_ZERO_ERROR; 578 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH, 579 &intStatus)); 580 } 581 582 void RuleBasedCollator::setStrength(ECollationStrength newStrength) 583 { 584 checkOwned(); 585 UErrorCode intStatus = U_ZERO_ERROR; 586 UCollationStrength strength = getUCollationStrength(newStrength); 587 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus); 588 } 589 590 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest, 591 int32_t destCapacity, 592 UErrorCode& status) const 593 { 594 return ucol_getReorderCodes(ucollator, dest, destCapacity, &status); 595 } 596 597 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes, 598 int32_t reorderCodesLength, 599 UErrorCode& status) 600 { 601 ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status); 602 } 603 604 605 /** 606 * Create a hash code for this collation. Just hash the main rule table -- that 607 * should be good enough for almost any use. 
608 */ 609 int32_t RuleBasedCollator::hashCode() const 610 { 611 int32_t length; 612 const UChar *rules = ucol_getRules(ucollator, &length); 613 return uhash_hashUCharsN(rules, length); 614 } 615 616 /** 617 * return the locale of this collator 618 */ 619 const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { 620 const char *result = ucol_getLocaleByType(ucollator, type, &status); 621 if(result == NULL) { 622 Locale res(""); 623 res.setToBogus(); 624 return res; 625 } else { 626 return Locale(result); 627 } 628 } 629 630 void 631 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { 632 checkOwned(); 633 char* rloc = uprv_strdup(requestedLocale.getName()); 634 if (rloc) { 635 char* vloc = uprv_strdup(validLocale.getName()); 636 if (vloc) { 637 char* aloc = uprv_strdup(actualLocale.getName()); 638 if (aloc) { 639 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); 640 return; 641 } 642 uprv_free(vloc); 643 } 644 uprv_free(rloc); 645 } 646 } 647 648 // RuleBaseCollatorNew private constructor ---------------------------------- 649 650 RuleBasedCollator::RuleBasedCollator() 651 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 652 { 653 } 654 655 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, 656 UErrorCode& status) 657 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 658 { 659 if (U_FAILURE(status)) 660 return; 661 662 /* 663 Try to load, in order: 664 1. The desired locale's collation. 665 2. A fallback of the desired locale. 666 3. The default locale's collation. 667 4. A fallback of the default locale. 668 5. The default collation rules, which contains en_US collation rules. 
669 670 To reiterate, we try: 671 Specific: 672 language+country+variant 673 language+country 674 language 675 Default: 676 language+country+variant 677 language+country 678 language 679 Root: (aka DEFAULTRULES) 680 steps 1-5 are handled by resource bundle fallback mechanism. 681 however, in a very unprobable situation that no resource bundle 682 data exists, step 5 is repeated with hardcoded default rules. 683 */ 684 685 setUCollator(desiredLocale, status); 686 687 if (U_FAILURE(status)) 688 { 689 status = U_ZERO_ERROR; 690 691 setUCollator(kRootLocaleName, status); 692 if (status == U_ZERO_ERROR) { 693 status = U_USING_DEFAULT_WARNING; 694 } 695 } 696 697 if (U_SUCCESS(status)) 698 { 699 setRuleStringFromCollator(); 700 } 701 } 702 703 void 704 RuleBasedCollator::setUCollator(const char *locale, 705 UErrorCode &status) 706 { 707 if (U_FAILURE(status)) 708 return; 709 if (ucollator && dataIsOwned) 710 ucol_close(ucollator); 711 ucollator = ucol_open_internal(locale, &status); 712 dataIsOwned = TRUE; 713 isWriteThroughAlias = FALSE; 714 } 715 716 717 void 718 RuleBasedCollator::checkOwned() { 719 if (!(dataIsOwned || isWriteThroughAlias)) { 720 UErrorCode status = U_ZERO_ERROR; 721 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); 722 setRuleStringFromCollator(); 723 dataIsOwned = TRUE; 724 isWriteThroughAlias = FALSE; 725 } 726 } 727 728 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 729 730 U_NAMESPACE_END 731 732 #endif /* #if !UCONFIG_NO_COLLATION */ 733