1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2009, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 /** 9 * File tblcoll.cpp 10 * 11 * Created by: Helena Shih 12 * 13 * Modification History: 14 * 15 * Date Name Description 16 * 2/5/97 aliu Added streamIn and streamOut methods. Added 17 * constructor which reads RuleBasedCollator object from 18 * a binary file. Added writeToFile method which streams 19 * RuleBasedCollator out to a binary file. The streamIn 20 * and streamOut methods use istream and ostream objects 21 * in binary mode. 22 * 2/11/97 aliu Moved declarations out of for loop initializer. 23 * Added Mac compatibility #ifdef for ios::nocreate. 24 * 2/12/97 aliu Modified to use TableCollationData sub-object to 25 * hold invariant data. 26 * 2/13/97 aliu Moved several methods into this class from Collation. 27 * Added a private RuleBasedCollator(Locale&) constructor, 28 * to be used by Collator::getInstance(). General 29 * clean up. Made use of UErrorCode variables consistent. 30 * 2/20/97 helena Added clone, operator==, operator!=, operator=, and copy 31 * constructor and getDynamicClassID. 32 * 3/5/97 aliu Changed compaction cycle to improve performance. We 33 * use the maximum allowable value which is kBlockCount. 34 * Modified getRules() to load rules dynamically. Changed 35 * constructFromFile() call to accomodate this (added 36 * parameter to specify whether binary loading is to 37 * take place). 38 * 05/06/97 helena Added memory allocation error check. 39 * 6/20/97 helena Java class name change. 40 * 6/23/97 helena Adding comments to make code more readable. 41 * 09/03/97 helena Added createCollationKeyValues(). 42 * 06/26/98 erm Changes for CollationKeys using byte arrays. 
43 * 08/10/98 erm Synched with 1.2 version of RuleBasedCollator.java 44 * 04/23/99 stephen Removed EDecompositionMode, merged with 45 * Normalizer::EMode 46 * 06/14/99 stephen Removed kResourceBundleSuffix 47 * 06/22/99 stephen Fixed logic in constructFromFile() since .ctx 48 * files are no longer used. 49 * 11/02/99 helena Collator performance enhancements. Special case 50 * for NO_OP situations. 51 * 11/17/99 srl More performance enhancements. Inlined some internal functions. 52 * 12/15/99 aliu Update to support Thai collation. Move NormalizerIterator 53 * to implementation file. 54 * 01/29/01 synwee Modified into a C++ wrapper calling C APIs (ucol.h) 55 */ 56 57 #include "unicode/utypes.h" 58 59 #if !UCONFIG_NO_COLLATION 60 61 #include "unicode/tblcoll.h" 62 #include "unicode/coleitr.h" 63 #include "unicode/ures.h" 64 #include "unicode/uset.h" 65 #include "ucol_imp.h" 66 #include "uresimp.h" 67 #include "uhash.h" 68 #include "cmemory.h" 69 #include "cstring.h" 70 #include "putilimp.h" 71 72 /* public RuleBasedCollator constructor ---------------------------------- */ 73 74 U_NAMESPACE_BEGIN 75 76 /** 77 * Copy constructor, aliasing, not write-through 78 */ 79 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that) 80 : Collator(that) 81 , dataIsOwned(FALSE) 82 , isWriteThroughAlias(FALSE) 83 , ucollator(NULL) 84 { 85 RuleBasedCollator::operator=(that); 86 } 87 88 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 89 UErrorCode& status) : 90 dataIsOwned(FALSE) 91 { 92 construct(rules, 93 UCOL_DEFAULT_STRENGTH, 94 UCOL_DEFAULT, 95 status); 96 } 97 98 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 99 ECollationStrength collationStrength, 100 UErrorCode& status) : dataIsOwned(FALSE) 101 { 102 construct(rules, 103 getUCollationStrength(collationStrength), 104 UCOL_DEFAULT, 105 status); 106 } 107 108 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 109 UColAttributeValue decompositionMode, 110 UErrorCode& 
status) : 111 dataIsOwned(FALSE) 112 { 113 construct(rules, 114 UCOL_DEFAULT_STRENGTH, 115 decompositionMode, 116 status); 117 } 118 119 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules, 120 ECollationStrength collationStrength, 121 UColAttributeValue decompositionMode, 122 UErrorCode& status) : dataIsOwned(FALSE) 123 { 124 construct(rules, 125 getUCollationStrength(collationStrength), 126 decompositionMode, 127 status); 128 } 129 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length, 130 const RuleBasedCollator *base, 131 UErrorCode &status) : 132 dataIsOwned(TRUE), 133 isWriteThroughAlias(FALSE) 134 { 135 ucollator = ucol_openBinary(bin, length, base->ucollator, &status); 136 } 137 138 void 139 RuleBasedCollator::setRuleStringFromCollator() 140 { 141 int32_t length; 142 const UChar *r = ucol_getRules(ucollator, &length); 143 144 if (r && length > 0) { 145 // alias the rules string 146 urulestring.setTo(TRUE, r, length); 147 } 148 else { 149 urulestring.truncate(0); // Clear string. 
150 } 151 } 152 153 // not aliasing, not write-through 154 void 155 RuleBasedCollator::construct(const UnicodeString& rules, 156 UColAttributeValue collationStrength, 157 UColAttributeValue decompositionMode, 158 UErrorCode& status) 159 { 160 ucollator = ucol_openRules(rules.getBuffer(), rules.length(), 161 decompositionMode, collationStrength, 162 NULL, &status); 163 164 dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it 165 isWriteThroughAlias = FALSE; 166 167 if(ucollator == NULL) { 168 if(U_SUCCESS(status)) { 169 status = U_MEMORY_ALLOCATION_ERROR; 170 } 171 return; // Failure 172 } 173 174 setRuleStringFromCollator(); 175 } 176 177 /* RuleBasedCollator public destructor ----------------------------------- */ 178 179 RuleBasedCollator::~RuleBasedCollator() 180 { 181 if (dataIsOwned) 182 { 183 ucol_close(ucollator); 184 } 185 ucollator = 0; 186 } 187 188 /* RuleBaseCollator public methods --------------------------------------- */ 189 190 UBool RuleBasedCollator::operator==(const Collator& that) const 191 { 192 /* only checks for address equals here */ 193 if (Collator::operator==(that)) 194 return TRUE; 195 196 if (getDynamicClassID() != that.getDynamicClassID()) 197 return FALSE; /* not the same class */ 198 199 RuleBasedCollator& thatAlias = (RuleBasedCollator&)that; 200 201 // weiv: use C function, commented code below is wrong 202 return ucol_equals(this->ucollator, thatAlias.ucollator); 203 /* 204 synwee : orginal code does not check for data compatibility 205 */ 206 /* 207 if (ucollator != thatAlias.ucollator) 208 return FALSE; 209 210 return TRUE; 211 */ 212 } 213 214 UBool RuleBasedCollator::operator!=(const Collator& other) const 215 { 216 return !(*this == other); 217 } 218 219 // aliasing, not write-through 220 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that) 221 { 222 if (this != &that) 223 { 224 if (dataIsOwned) 225 { 226 ucol_close(ucollator); 227 } 228 229 urulestring.truncate(0); // empty 
the rule string 230 dataIsOwned = TRUE; 231 isWriteThroughAlias = FALSE; 232 233 UErrorCode intStatus = U_ZERO_ERROR; 234 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 235 ucollator = ucol_safeClone(that.ucollator, NULL, &buffersize, 236 &intStatus); 237 if (U_SUCCESS(intStatus)) { 238 setRuleStringFromCollator(); 239 } 240 } 241 return *this; 242 } 243 244 // aliasing, not write-through 245 Collator* RuleBasedCollator::clone() const 246 { 247 return new RuleBasedCollator(*this); 248 } 249 250 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 251 (const UnicodeString& source) const 252 { 253 UErrorCode status = U_ZERO_ERROR; 254 CollationElementIterator *result = new CollationElementIterator(source, this, 255 status); 256 if (U_FAILURE(status)) { 257 delete result; 258 return NULL; 259 } 260 261 return result; 262 } 263 264 /** 265 * Create a CollationElementIterator object that will iterate over the 266 * elements in a string, using the collation rules defined in this 267 * RuleBasedCollator 268 */ 269 CollationElementIterator* RuleBasedCollator::createCollationElementIterator 270 (const CharacterIterator& source) const 271 { 272 UErrorCode status = U_ZERO_ERROR; 273 CollationElementIterator *result = new CollationElementIterator(source, this, 274 status); 275 276 if (U_FAILURE(status)) { 277 delete result; 278 return NULL; 279 } 280 281 return result; 282 } 283 284 /** 285 * Return a string representation of this collator's rules. The string can 286 * later be passed to the constructor that takes a UnicodeString argument, 287 * which will construct a collator that's functionally identical to this one. 288 * You can also allow users to edit the string in order to change the collation 289 * data, or you can print it out for inspection, or whatever. 
290 */ 291 const UnicodeString& RuleBasedCollator::getRules() const 292 { 293 return urulestring; 294 } 295 296 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer) 297 { 298 int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1); 299 300 if (rulesize > 0) { 301 UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) ); 302 if(rules != NULL) { 303 ucol_getRulesEx(ucollator, delta, rules, rulesize); 304 buffer.setTo(rules, rulesize); 305 uprv_free(rules); 306 } else { // couldn't allocate 307 buffer.remove(); 308 } 309 } 310 else { 311 buffer.remove(); 312 } 313 } 314 315 UnicodeSet * 316 RuleBasedCollator::getTailoredSet(UErrorCode &status) const 317 { 318 if(U_FAILURE(status)) { 319 return NULL; 320 } 321 return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status); 322 } 323 324 325 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const 326 { 327 if (versionInfo!=NULL){ 328 ucol_getVersion(ucollator, versionInfo); 329 } 330 } 331 332 Collator::EComparisonResult RuleBasedCollator::compare( 333 const UnicodeString& source, 334 const UnicodeString& target, 335 int32_t length) const 336 { 337 UErrorCode status = U_ZERO_ERROR; 338 return getEComparisonResult(compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status)); 339 } 340 341 UCollationResult RuleBasedCollator::compare( 342 const UnicodeString& source, 343 const UnicodeString& target, 344 int32_t length, 345 UErrorCode &status) const 346 { 347 return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status); 348 } 349 350 Collator::EComparisonResult RuleBasedCollator::compare(const UChar* source, 351 int32_t sourceLength, 352 const UChar* target, 353 int32_t targetLength) 354 const 355 { 356 return getEComparisonResult(ucol_strcoll(ucollator, source, sourceLength, 357 target, targetLength)); 358 } 359 360 UCollationResult 
RuleBasedCollator::compare(const UChar* source, 361 int32_t sourceLength, 362 const UChar* target, 363 int32_t targetLength, 364 UErrorCode &status) const 365 { 366 if(U_SUCCESS(status)) { 367 return ucol_strcoll(ucollator, source, sourceLength, target, targetLength); 368 } else { 369 return UCOL_EQUAL; 370 } 371 } 372 373 /** 374 * Compare two strings using this collator 375 */ 376 Collator::EComparisonResult RuleBasedCollator::compare( 377 const UnicodeString& source, 378 const UnicodeString& target) const 379 { 380 return getEComparisonResult(ucol_strcoll(ucollator, source.getBuffer(), source.length(), 381 target.getBuffer(), target.length())); 382 } 383 384 UCollationResult RuleBasedCollator::compare( 385 const UnicodeString& source, 386 const UnicodeString& target, 387 UErrorCode &status) const 388 { 389 if(U_SUCCESS(status)) { 390 return ucol_strcoll(ucollator, source.getBuffer(), source.length(), 391 target.getBuffer(), target.length()); 392 } else { 393 return UCOL_EQUAL; 394 } 395 } 396 397 UCollationResult RuleBasedCollator::compare(UCharIterator &sIter, 398 UCharIterator &tIter, 399 UErrorCode &status) const { 400 if(U_SUCCESS(status)) { 401 return ucol_strcollIter(ucollator, &sIter, &tIter, &status); 402 } else { 403 return UCOL_EQUAL; 404 } 405 } 406 407 /** 408 * Retrieve a collation key for the specified string. The key can be compared 409 * with other collation keys using a bitwise comparison (e.g. memcmp) to find 410 * the ordering of their respective source strings. This is handy when doing a 411 * sort, where each sort key must be compared many times. 412 * 413 * The basic algorithm here is to find all of the collation elements for each 414 * character in the source string, convert them to an ASCII representation, and 415 * put them into the collation key. But it's trickier than that. 
* Each collation element in a string has three components: primary ('A' vs 'B'),
 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
 * at the end of a string takes precedence over a secondary or tertiary
 * difference earlier in the string.
 *
 * To account for this, we put all of the primary orders at the beginning of
 * the string, followed by the secondary and tertiary orders. Each set of
 * orders is terminated by nulls so that a key for a string which is an initial
 * substring of another key will compare less without any special case.
 *
 * Here's a hypothetical example, with the collation element represented as a
 * three-digit number, one digit for primary, one for secondary, etc.
 *
 * String:              A     a     B    \u00C9
 * Collation Elements: 101   100   201  511
 * Collation Key:      1125<null>0001<null>1011<null>
 *
 * To make things even trickier, secondary differences (accent marks) are
 * compared starting at the *end* of the string in languages with French
 * secondary ordering. But when comparing the accent marks on a single base
 * character, they are compared from the beginning. To handle this, we reverse
 * all of the accents that belong to each base character, then we reverse the
 * entire string of secondary orderings at the end.
439 */ 440 CollationKey& RuleBasedCollator::getCollationKey( 441 const UnicodeString& source, 442 CollationKey& sortkey, 443 UErrorCode& status) const 444 { 445 return getCollationKey(source.getBuffer(), source.length(), sortkey, status); 446 } 447 448 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source, 449 int32_t sourceLen, 450 CollationKey& sortkey, 451 UErrorCode& status) const 452 { 453 if (U_FAILURE(status)) 454 { 455 return sortkey.setToBogus(); 456 } 457 458 if ((!source) || (sourceLen == 0)) { 459 return sortkey.reset(); 460 } 461 462 uint8_t *result; 463 int32_t resultLen = ucol_getSortKeyWithAllocation(ucollator, 464 source, sourceLen, 465 &result, 466 &status); 467 sortkey.adopt(result, resultLen); 468 return sortkey; 469 } 470 471 /** 472 * Return the maximum length of any expansion sequences that end with the 473 * specified comparison order. 474 * @param order a collation order returned by previous or next. 475 * @return the maximum length of any expansion seuences ending with the 476 * specified order or 1 if collation order does not occur at the end of any 477 * expansion sequence. 
478 * @see CollationElementIterator#getMaxExpansion 479 */ 480 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const 481 { 482 uint8_t result; 483 UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result); 484 return result; 485 } 486 487 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length, 488 UErrorCode &status) 489 { 490 return ucol_cloneRuleData(ucollator, &length, &status); 491 } 492 493 494 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status) 495 { 496 return ucol_cloneBinary(ucollator, buffer, capacity, &status); 497 } 498 499 void RuleBasedCollator::setAttribute(UColAttribute attr, 500 UColAttributeValue value, 501 UErrorCode &status) 502 { 503 if (U_FAILURE(status)) 504 return; 505 checkOwned(); 506 ucol_setAttribute(ucollator, attr, value, &status); 507 } 508 509 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr, 510 UErrorCode &status) 511 { 512 if (U_FAILURE(status)) 513 return UCOL_DEFAULT; 514 return ucol_getAttribute(ucollator, attr, &status); 515 } 516 517 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) { 518 checkOwned(); 519 return ucol_setVariableTop(ucollator, varTop, len, &status); 520 } 521 522 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString varTop, UErrorCode &status) { 523 checkOwned(); 524 return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status); 525 } 526 527 void RuleBasedCollator::setVariableTop(const uint32_t varTop, UErrorCode &status) { 528 checkOwned(); 529 ucol_restoreVariableTop(ucollator, varTop, &status); 530 } 531 532 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const { 533 return ucol_getVariableTop(ucollator, &status); 534 } 535 536 Collator* RuleBasedCollator::safeClone(void) 537 { 538 UErrorCode intStatus = U_ZERO_ERROR; 539 int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE; 540 UCollator *ucol = ucol_safeClone(ucollator, NULL, &buffersize, 541 
&intStatus); 542 if (U_FAILURE(intStatus)) { 543 return NULL; 544 } 545 546 RuleBasedCollator *result = new RuleBasedCollator(); 547 // Null pointer check 548 if (result != NULL) { 549 result->ucollator = ucol; 550 result->dataIsOwned = TRUE; 551 result->isWriteThroughAlias = FALSE; 552 setRuleStringFromCollator(); 553 } 554 555 return result; 556 } 557 558 559 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source, 560 uint8_t *result, int32_t resultLength) 561 const 562 { 563 return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength); 564 } 565 566 int32_t RuleBasedCollator::getSortKey(const UChar *source, 567 int32_t sourceLength, uint8_t *result, 568 int32_t resultLength) const 569 { 570 return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength); 571 } 572 573 Collator::ECollationStrength RuleBasedCollator::getStrength(void) const 574 { 575 UErrorCode intStatus = U_ZERO_ERROR; 576 return getECollationStrength(ucol_getAttribute(ucollator, UCOL_STRENGTH, 577 &intStatus)); 578 } 579 580 void RuleBasedCollator::setStrength(ECollationStrength newStrength) 581 { 582 checkOwned(); 583 UErrorCode intStatus = U_ZERO_ERROR; 584 UCollationStrength strength = getUCollationStrength(newStrength); 585 ucol_setAttribute(ucollator, UCOL_STRENGTH, strength, &intStatus); 586 } 587 588 /** 589 * Create a hash code for this collation. Just hash the main rule table -- that 590 * should be good enough for almost any use. 
591 */ 592 int32_t RuleBasedCollator::hashCode() const 593 { 594 int32_t length; 595 const UChar *rules = ucol_getRules(ucollator, &length); 596 return uhash_hashUCharsN(rules, length); 597 } 598 599 /** 600 * return the locale of this collator 601 */ 602 const Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const { 603 const char *result = ucol_getLocaleByType(ucollator, type, &status); 604 if(result == NULL) { 605 Locale res(""); 606 res.setToBogus(); 607 return res; 608 } else { 609 return Locale(result); 610 } 611 } 612 613 void 614 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) { 615 checkOwned(); 616 char* rloc = uprv_strdup(requestedLocale.getName()); 617 if (rloc) { 618 char* vloc = uprv_strdup(validLocale.getName()); 619 if (vloc) { 620 char* aloc = uprv_strdup(actualLocale.getName()); 621 if (aloc) { 622 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc); 623 return; 624 } 625 uprv_free(vloc); 626 } 627 uprv_free(rloc); 628 } 629 } 630 631 // RuleBaseCollatorNew private constructor ---------------------------------- 632 633 RuleBasedCollator::RuleBasedCollator() 634 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 635 { 636 } 637 638 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale, 639 UErrorCode& status) 640 : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL) 641 { 642 if (U_FAILURE(status)) 643 return; 644 645 /* 646 Try to load, in order: 647 1. The desired locale's collation. 648 2. A fallback of the desired locale. 649 3. The default locale's collation. 650 4. A fallback of the default locale. 651 5. The default collation rules, which contains en_US collation rules. 
652 653 To reiterate, we try: 654 Specific: 655 language+country+variant 656 language+country 657 language 658 Default: 659 language+country+variant 660 language+country 661 language 662 Root: (aka DEFAULTRULES) 663 steps 1-5 are handled by resource bundle fallback mechanism. 664 however, in a very unprobable situation that no resource bundle 665 data exists, step 5 is repeated with hardcoded default rules. 666 */ 667 668 setUCollator(desiredLocale, status); 669 670 if (U_FAILURE(status)) 671 { 672 status = U_ZERO_ERROR; 673 674 setUCollator(kRootLocaleName, status); 675 if (status == U_ZERO_ERROR) { 676 status = U_USING_DEFAULT_WARNING; 677 } 678 } 679 680 if (U_SUCCESS(status)) 681 { 682 setRuleStringFromCollator(); 683 } 684 } 685 686 void 687 RuleBasedCollator::setUCollator(const char *locale, 688 UErrorCode &status) 689 { 690 if (U_FAILURE(status)) 691 return; 692 if (ucollator && dataIsOwned) 693 ucol_close(ucollator); 694 ucollator = ucol_open_internal(locale, &status); 695 dataIsOwned = TRUE; 696 isWriteThroughAlias = FALSE; 697 } 698 699 700 void 701 RuleBasedCollator::checkOwned() { 702 if (!(dataIsOwned || isWriteThroughAlias)) { 703 UErrorCode status = U_ZERO_ERROR; 704 ucollator = ucol_safeClone(ucollator, NULL, NULL, &status); 705 setRuleStringFromCollator(); 706 dataIsOwned = TRUE; 707 isWriteThroughAlias = FALSE; 708 } 709 } 710 711 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedCollator) 712 713 U_NAMESPACE_END 714 715 #endif /* #if !UCONFIG_NO_COLLATION */ 716