Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  * Copyright (C) 1996-2012, International Business Machines Corporation and
      4  * others. All Rights Reserved.
      5  ******************************************************************************
      6  */
      8 /**
      9  * File tblcoll.cpp
     10  *
     11  * Created by: Helena Shih
     12  *
     13  * Modification History:
     14  *
     15  *  Date        Name        Description
     16  *  2/5/97      aliu        Added streamIn and streamOut methods.  Added
     17  *                          constructor which reads RuleBasedCollator object from
     18  *                          a binary file.  Added writeToFile method which streams
     19  *                          RuleBasedCollator out to a binary file.  The streamIn
     20  *                          and streamOut methods use istream and ostream objects
     21  *                          in binary mode.
     22  *  2/11/97     aliu        Moved declarations out of for loop initializer.
     23  *                          Added Mac compatibility #ifdef for ios::nocreate.
     24  *  2/12/97     aliu        Modified to use TableCollationData sub-object to
     25  *                          hold invariant data.
     26  *  2/13/97     aliu        Moved several methods into this class from Collation.
     27  *                          Added a private RuleBasedCollator(Locale&) constructor,
     28  *                          to be used by Collator::getInstance().  General
     29  *                          clean up.  Made use of UErrorCode variables consistent.
     30  *  2/20/97     helena      Added clone, operator==, operator!=, operator=, and copy
     31  *                          constructor and getDynamicClassID.
     32  *  3/5/97      aliu        Changed compaction cycle to improve performance.  We
     33  *                          use the maximum allowable value which is kBlockCount.
     34  *                          Modified getRules() to load rules dynamically.  Changed
     35  *                          constructFromFile() call to accomodate this (added
     36  *                          parameter to specify whether binary loading is to
     37  *                          take place).
     38  * 05/06/97     helena      Added memory allocation error check.
     39  *  6/20/97     helena      Java class name change.
     40  *  6/23/97     helena      Adding comments to make code more readable.
     41  * 09/03/97     helena      Added createCollationKeyValues().
     42  * 06/26/98     erm         Changes for CollationKeys using byte arrays.
     43  * 08/10/98     erm         Synched with 1.2 version of RuleBasedCollator.java
     44  * 04/23/99     stephen     Removed EDecompositionMode, merged with
     45  *                          Normalizer::EMode
     46  * 06/14/99     stephen     Removed kResourceBundleSuffix
     47  * 06/22/99     stephen     Fixed logic in constructFromFile() since .ctx
     48  *                          files are no longer used.
     49  * 11/02/99     helena      Collator performance enhancements.  Special case
     50  *                          for NO_OP situations.
     51  * 11/17/99     srl         More performance enhancements. Inlined some internal functions.
     52  * 12/15/99     aliu        Update to support Thai collation.  Move NormalizerIterator
     53  *                          to implementation file.
     54  * 01/29/01     synwee      Modified into a C++ wrapper calling C APIs (ucol.h)
     55  */
     57 #include "unicode/utypes.h"
     61 #include "unicode/tblcoll.h"
     62 #include "unicode/coleitr.h"
     63 #include "unicode/ures.h"
     64 #include "unicode/uset.h"
     65 #include "ucol_imp.h"
     66 #include "uresimp.h"
     67 #include "uhash.h"
     68 #include "cmemory.h"
     69 #include "cstring.h"
     70 #include "putilimp.h"
     71 #include "ustr_imp.h"
     73 /* public RuleBasedCollator constructor ---------------------------------- */
     77 /**
     78 * Copy constructor, aliasing, not write-through
     79 */
     80 RuleBasedCollator::RuleBasedCollator(const RuleBasedCollator& that)
     81 : Collator(that)
     82 , dataIsOwned(FALSE)
     83 , isWriteThroughAlias(FALSE)
     84 , ucollator(NULL)
     85 {
     86     RuleBasedCollator::operator=(that);
     87 }
     89 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
     90                                      UErrorCode& status) :
     91 dataIsOwned(FALSE)
     92 {
     93     construct(rules,
     94         UCOL_DEFAULT_STRENGTH,
     95         UCOL_DEFAULT,
     96         status);
     97 }
     99 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
    100                                      ECollationStrength collationStrength,
    101                                      UErrorCode& status) : dataIsOwned(FALSE)
    102 {
    103     construct(rules,
    104         (UColAttributeValue)collationStrength,
    105         UCOL_DEFAULT,
    106         status);
    107 }
    109 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
    110                                      UColAttributeValue decompositionMode,
    111                                      UErrorCode& status) :
    112 dataIsOwned(FALSE)
    113 {
    114     construct(rules,
    115         UCOL_DEFAULT_STRENGTH,
    116         decompositionMode,
    117         status);
    118 }
    120 RuleBasedCollator::RuleBasedCollator(const UnicodeString& rules,
    121                                      ECollationStrength collationStrength,
    122                                      UColAttributeValue decompositionMode,
    123                                      UErrorCode& status) : dataIsOwned(FALSE)
    124 {
    125     construct(rules,
    126         (UColAttributeValue)collationStrength,
    127         decompositionMode,
    128         status);
    129 }
    130 RuleBasedCollator::RuleBasedCollator(const uint8_t *bin, int32_t length,
    131                     const RuleBasedCollator *base,
    132                     UErrorCode &status) :
    133 dataIsOwned(TRUE),
    134 isWriteThroughAlias(FALSE)
    135 {
    136   ucollator = ucol_openBinary(bin, length, base->ucollator, &status);
    137 }
    139 void
    140 RuleBasedCollator::setRuleStringFromCollator()
    141 {
    142     int32_t length;
    143     const UChar *r = ucol_getRules(ucollator, &length);
    145     if (r && length > 0) {
    146         // alias the rules string
    147         urulestring.setTo(TRUE, r, length);
    148     }
    149     else {
    150         urulestring.truncate(0); // Clear string.
    151     }
    152 }
    154 // not aliasing, not write-through
    155 void
    156 RuleBasedCollator::construct(const UnicodeString& rules,
    157                              UColAttributeValue collationStrength,
    158                              UColAttributeValue decompositionMode,
    159                              UErrorCode& status)
    160 {
    161     ucollator = ucol_openRules(rules.getBuffer(), rules.length(),
    162         decompositionMode, collationStrength,
    163         NULL, &status);
    165     dataIsOwned = TRUE; // since we own a collator now, we need to get rid of it
    166     isWriteThroughAlias = FALSE;
    168     if(ucollator == NULL) {
    169         if(U_SUCCESS(status)) {
    170             status = U_MEMORY_ALLOCATION_ERROR;
    171         }
    172         return; // Failure
    173     }
    175     setRuleStringFromCollator();
    176 }
    178 /* RuleBasedCollator public destructor ----------------------------------- */
    180 RuleBasedCollator::~RuleBasedCollator()
    181 {
    182     if (dataIsOwned)
    183     {
    184         ucol_close(ucollator);
    185     }
    186     ucollator = 0;
    187 }
    189 /* RuleBaseCollator public methods --------------------------------------- */
    191 UBool RuleBasedCollator::operator==(const Collator& that) const
    192 {
    193   /* only checks for address equals here */
    194   if (this == &that) {
    195     return TRUE;
    196   }
    197   if (!Collator::operator==(that)) {
    198     return FALSE;  /* not the same class */
    199   }
    201   RuleBasedCollator& thatAlias = (RuleBasedCollator&)that;
    203   return ucol_equals(this->ucollator, thatAlias.ucollator);
    204 }
    206 // aliasing, not write-through
    207 RuleBasedCollator& RuleBasedCollator::operator=(const RuleBasedCollator& that)
    208 {
    209     if (this == &that) { return *this; }
    211     UErrorCode intStatus = U_ZERO_ERROR;
    212     int32_t buffersize = U_COL_SAFECLONE_BUFFERSIZE;
    213     UCollator *ucol = ucol_safeClone(that.ucollator, NULL, &buffersize, &intStatus);
    214     if (U_FAILURE(intStatus)) { return *this; }
    216     if (dataIsOwned) {
    217         ucol_close(ucollator);
    218     }
    219     ucollator = ucol;
    220     dataIsOwned = TRUE;
    221     isWriteThroughAlias = FALSE;
    222     setRuleStringFromCollator();
    223     return *this;
    224 }
    226 // aliasing, not write-through
    227 Collator* RuleBasedCollator::clone() const
    228 {
    229     RuleBasedCollator* coll = new RuleBasedCollator(*this);
    230     // There is a small chance that the internal ucol_safeClone() call fails.
    231     if (coll != NULL && coll->ucollator == NULL) {
    232         delete coll;
    233         return NULL;
    234     }
    235     return coll;
    236 }
    239 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
    240                                            (const UnicodeString& source) const
    241 {
    242     UErrorCode status = U_ZERO_ERROR;
    243     CollationElementIterator *result = new CollationElementIterator(source, this,
    244                                                                     status);
    245     if (U_FAILURE(status)) {
    246         delete result;
    247         return NULL;
    248     }
    250     return result;
    251 }
    253 /**
    254 * Create a CollationElementIterator object that will iterate over the
    255 * elements in a string, using the collation rules defined in this
    256 * RuleBasedCollator
    257 */
    258 CollationElementIterator* RuleBasedCollator::createCollationElementIterator
    259                                        (const CharacterIterator& source) const
    260 {
    261     UErrorCode status = U_ZERO_ERROR;
    262     CollationElementIterator *result = new CollationElementIterator(source, this,
    263                                                                     status);
    265     if (U_FAILURE(status)) {
    266         delete result;
    267         return NULL;
    268     }
    270     return result;
    271 }
    273 /**
    274 * Return a string representation of this collator's rules. The string can
    275 * later be passed to the constructor that takes a UnicodeString argument,
    276 * which will construct a collator that's functionally identical to this one.
    277 * You can also allow users to edit the string in order to change the collation
    278 * data, or you can print it out for inspection, or whatever.
    279 */
    280 const UnicodeString& RuleBasedCollator::getRules() const
    281 {
    282     return urulestring;
    283 }
    285 void RuleBasedCollator::getRules(UColRuleOption delta, UnicodeString &buffer)
    286 {
    287     int32_t rulesize = ucol_getRulesEx(ucollator, delta, NULL, -1);
    289     if (rulesize > 0) {
    290         UChar *rules = (UChar*) uprv_malloc( sizeof(UChar) * (rulesize) );
    291         if(rules != NULL) {
    292             ucol_getRulesEx(ucollator, delta, rules, rulesize);
    293             buffer.setTo(rules, rulesize);
    294             uprv_free(rules);
    295         } else { // couldn't allocate
    296             buffer.remove();
    297         }
    298     }
    299     else {
    300         buffer.remove();
    301     }
    302 }
    304 UnicodeSet *
    305 RuleBasedCollator::getTailoredSet(UErrorCode &status) const
    306 {
    307     if(U_FAILURE(status)) {
    308         return NULL;
    309     }
    310     return (UnicodeSet *)ucol_getTailoredSet(this->ucollator, &status);
    311 }
    314 void RuleBasedCollator::getVersion(UVersionInfo versionInfo) const
    315 {
    316     if (versionInfo!=NULL){
    317         ucol_getVersion(ucollator, versionInfo);
    318     }
    319 }
    321 /**
    322 * Compare two strings using this collator
    323 */
    324 UCollationResult RuleBasedCollator::compare(
    325                                                const UnicodeString& source,
    326                                                const UnicodeString& target,
    327                                                int32_t length,
    328                                                UErrorCode &status) const
    329 {
    330     return compare(source.getBuffer(), uprv_min(length,source.length()), target.getBuffer(), uprv_min(length,target.length()), status);
    331 }
    333 UCollationResult RuleBasedCollator::compare(const UChar* source,
    334                                                        int32_t sourceLength,
    335                                                        const UChar* target,
    336                                                        int32_t targetLength,
    337                                                        UErrorCode &status) const
    338 {
    339     if(U_SUCCESS(status)) {
    340         return  ucol_strcoll(ucollator, source, sourceLength, target, targetLength);
    341     } else {
    342         return UCOL_EQUAL;
    343     }
    344 }
    346 UCollationResult RuleBasedCollator::compare(
    347                                              const UnicodeString& source,
    348                                              const UnicodeString& target,
    349                                              UErrorCode &status) const
    350 {
    351     if(U_SUCCESS(status)) {
    352         return ucol_strcoll(ucollator, source.getBuffer(), source.length(),
    353                                        target.getBuffer(), target.length());
    354     } else {
    355         return UCOL_EQUAL;
    356     }
    357 }
    359 UCollationResult RuleBasedCollator::compare(UCharIterator &sIter,
    360                                             UCharIterator &tIter,
    361                                             UErrorCode &status) const {
    362     if(U_SUCCESS(status)) {
    363         return ucol_strcollIter(ucollator, &sIter, &tIter, &status);
    364     } else {
    365         return UCOL_EQUAL;
    366     }
    367 }
    369 /**
    370 * Retrieve a collation key for the specified string. The key can be compared
    371 * with other collation keys using a bitwise comparison (e.g. memcmp) to find
    372 * the ordering of their respective source strings. This is handy when doing a
    373 * sort, where each sort key must be compared many times.
    374 *
    375 * The basic algorithm here is to find all of the collation elements for each
    376 * character in the source string, convert them to an ASCII representation, and
    377 * put them into the collation key.  But it's trickier than that. Each
    378 * collation element in a string has three components: primary ('A' vs 'B'),
    379 * secondary ('u' vs '\u00FC'), and tertiary ('A' vs 'a'), and a primary difference
    380 * at the end of a string takes precedence over a secondary or tertiary
    381 * difference earlier in the string.
    382 *
    383 * To account for this, we put all of the primary orders at the beginning of
    384 * the string, followed by the secondary and tertiary orders. Each set of
    385 * orders is terminated by nulls so that a key for a string which is an initial
    386 * substring of another key will compare less without any special case.
    387 *
    388 * Here's a hypothetical example, with the collation element represented as a
    389 * three-digit number, one digit for primary, one for secondary, etc.
    390 *
    391 * String:              A     a     B    \u00C9
    392 * Collation Elements: 101   100   201  511
    393 * Collation Key:      1125<null>0001<null>1011<null>
    394 *
    395 * To make things even trickier, secondary differences (accent marks) are
    396 * compared starting at the *end* of the string in languages with French
    397 * secondary ordering. But when comparing the accent marks on a single base
    398 * character, they are compared from the beginning. To handle this, we reverse
    399 * all of the accents that belong to each base character, then we reverse the
    400 * entire string of secondary orderings at the end.
    401 */
    402 CollationKey& RuleBasedCollator::getCollationKey(
    403                                                   const UnicodeString& source,
    404                                                   CollationKey& sortkey,
    405                                                   UErrorCode& status) const
    406 {
    407     return getCollationKey(source.getBuffer(), source.length(), sortkey, status);
    408 }
    410 CollationKey& RuleBasedCollator::getCollationKey(const UChar* source,
    411                                                     int32_t sourceLen,
    412                                                     CollationKey& sortkey,
    413                                                     UErrorCode& status) const
    414 {
    415     if (U_FAILURE(status)) {
    416         return sortkey.setToBogus();
    417     }
    418     if (sourceLen < -1 || (source == NULL && sourceLen != 0)) {
    419         status = U_ILLEGAL_ARGUMENT_ERROR;
    420         return sortkey.setToBogus();
    421     }
    423     if (sourceLen < 0) {
    424         sourceLen = u_strlen(source);
    425     }
    426     if (sourceLen == 0) {
    427         return sortkey.reset();
    428     }
    430     int32_t resultLen = ucol_getCollationKey(ucollator, source, sourceLen, sortkey, status);
    432     if (U_SUCCESS(status)) {
    433         sortkey.setLength(resultLen);
    434     } else {
    435         sortkey.setToBogus();
    436     }
    437     return sortkey;
    438 }
    440 /**
    441  * Return the maximum length of any expansion sequences that end with the
    442  * specified comparison order.
    443  * @param order a collation order returned by previous or next.
    444  * @return the maximum length of any expansion seuences ending with the
    445  *         specified order or 1 if collation order does not occur at the end of any
    446  *         expansion sequence.
    447  * @see CollationElementIterator#getMaxExpansion
    448  */
    449 int32_t RuleBasedCollator::getMaxExpansion(int32_t order) const
    450 {
    451     uint8_t result;
    452     UCOL_GETMAXEXPANSION(ucollator, (uint32_t)order, result);
    453     return result;
    454 }
    456 uint8_t* RuleBasedCollator::cloneRuleData(int32_t &length,
    457                                               UErrorCode &status)
    458 {
    459     return ucol_cloneRuleData(ucollator, &length, &status);
    460 }
    463 int32_t RuleBasedCollator::cloneBinary(uint8_t *buffer, int32_t capacity, UErrorCode &status)
    464 {
    465   return ucol_cloneBinary(ucollator, buffer, capacity, &status);
    466 }
    468 void RuleBasedCollator::setAttribute(UColAttribute attr,
    469                                      UColAttributeValue value,
    470                                      UErrorCode &status)
    471 {
    472     if (U_FAILURE(status))
    473         return;
    474     checkOwned();
    475     ucol_setAttribute(ucollator, attr, value, &status);
    476 }
    478 UColAttributeValue RuleBasedCollator::getAttribute(UColAttribute attr,
    479                                                       UErrorCode &status) const
    480 {
    481     if (U_FAILURE(status))
    482         return UCOL_DEFAULT;
    483     return ucol_getAttribute(ucollator, attr, &status);
    484 }
    486 uint32_t RuleBasedCollator::setVariableTop(const UChar *varTop, int32_t len, UErrorCode &status) {
    487     checkOwned();
    488     return ucol_setVariableTop(ucollator, varTop, len, &status);
    489 }
    491 uint32_t RuleBasedCollator::setVariableTop(const UnicodeString &varTop, UErrorCode &status) {
    492     checkOwned();
    493     return ucol_setVariableTop(ucollator, varTop.getBuffer(), varTop.length(), &status);
    494 }
    496 void RuleBasedCollator::setVariableTop(uint32_t varTop, UErrorCode &status) {
    497     checkOwned();
    498     ucol_restoreVariableTop(ucollator, varTop, &status);
    499 }
    501 uint32_t RuleBasedCollator::getVariableTop(UErrorCode &status) const {
    502   return ucol_getVariableTop(ucollator, &status);
    503 }
    505 int32_t RuleBasedCollator::getSortKey(const UnicodeString& source,
    506                                          uint8_t *result, int32_t resultLength)
    507                                          const
    508 {
    509     return ucol_getSortKey(ucollator, source.getBuffer(), source.length(), result, resultLength);
    510 }
    512 int32_t RuleBasedCollator::getSortKey(const UChar *source,
    513                                          int32_t sourceLength, uint8_t *result,
    514                                          int32_t resultLength) const
    515 {
    516     return ucol_getSortKey(ucollator, source, sourceLength, result, resultLength);
    517 }
    519 int32_t RuleBasedCollator::getReorderCodes(int32_t *dest,
    520                                           int32_t destCapacity,
    521                                           UErrorCode& status) const
    522 {
    523     return ucol_getReorderCodes(ucollator, dest, destCapacity, &status);
    524 }
    526 void RuleBasedCollator::setReorderCodes(const int32_t *reorderCodes,
    527                                        int32_t reorderCodesLength,
    528                                        UErrorCode& status)
    529 {
    530     checkOwned();
    531     ucol_setReorderCodes(ucollator, reorderCodes, reorderCodesLength, &status);
    532 }
    534 int32_t RuleBasedCollator::getEquivalentReorderCodes(int32_t reorderCode,
    535                                 int32_t* dest,
    536                                 int32_t destCapacity,
    537                                 UErrorCode& status)
    538 {
    539     return ucol_getEquivalentReorderCodes(reorderCode, dest, destCapacity, &status);
    540 }
    542 /**
    543 * Create a hash code for this collation. Just hash the main rule table -- that
    544 * should be good enough for almost any use.
    545 */
    546 int32_t RuleBasedCollator::hashCode() const
    547 {
    548     int32_t length;
    549     const UChar *rules = ucol_getRules(ucollator, &length);
    550     return ustr_hashUCharsN(rules, length);
    551 }
    553 /**
    554 * return the locale of this collator
    555 */
    556 Locale RuleBasedCollator::getLocale(ULocDataLocaleType type, UErrorCode &status) const {
    557     const char *result = ucol_getLocaleByType(ucollator, type, &status);
    558     if(result == NULL) {
    559         Locale res("");
    560         res.setToBogus();
    561         return res;
    562     } else {
    563         return Locale(result);
    564     }
    565 }
    567 void
    568 RuleBasedCollator::setLocales(const Locale& requestedLocale, const Locale& validLocale, const Locale& actualLocale) {
    569     checkOwned();
    570     char* rloc  = uprv_strdup(requestedLocale.getName());
    571     if (rloc) {
    572         char* vloc = uprv_strdup(validLocale.getName());
    573         if (vloc) {
    574             char* aloc = uprv_strdup(actualLocale.getName());
    575             if (aloc) {
    576                 ucol_setReqValidLocales(ucollator, rloc, vloc, aloc);
    577                 return;
    578             }
    579             uprv_free(vloc);
    580         }
    581         uprv_free(rloc);
    582     }
    583 }
    585 // RuleBaseCollatorNew private constructor ----------------------------------
    587 RuleBasedCollator::RuleBasedCollator()
    588   : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
    589 {
    590 }
    592 RuleBasedCollator::RuleBasedCollator(const Locale& desiredLocale,
    593                                            UErrorCode& status)
    594  : dataIsOwned(FALSE), isWriteThroughAlias(FALSE), ucollator(NULL)
    595 {
    596     if (U_FAILURE(status))
    597         return;
    599     /*
    600     Try to load, in order:
    601      1. The desired locale's collation.
    602      2. A fallback of the desired locale.
    603      3. The default locale's collation.
    604      4. A fallback of the default locale.
    605      5. The default collation rules, which contains en_US collation rules.
    607      To reiterate, we try:
    608      Specific:
    609       language+country+variant
    610       language+country
    611       language
    612      Default:
    613       language+country+variant
    614       language+country
    615       language
    616      Root: (aka DEFAULTRULES)
    617      steps 1-5 are handled by resource bundle fallback mechanism.
    618      however, in a very unprobable situation that no resource bundle
    619      data exists, step 5 is repeated with hardcoded default rules.
    620     */
    622     setUCollator(desiredLocale, status);
    624     if (U_FAILURE(status))
    625     {
    626         status = U_ZERO_ERROR;
    628         setUCollator(kRootLocaleName, status);
    629         if (status == U_ZERO_ERROR) {
    630             status = U_USING_DEFAULT_WARNING;
    631         }
    632     }
    634     if (U_SUCCESS(status))
    635     {
    636         setRuleStringFromCollator();
    637     }
    638 }
    640 void
    641 RuleBasedCollator::setUCollator(const char *locale,
    642                                 UErrorCode &status)
    643 {
    644     if (U_FAILURE(status)) {
    645         return;
    646     }
    647     if (ucollator && dataIsOwned)
    648         ucol_close(ucollator);
    649     ucollator = ucol_open_internal(locale, &status);
    650     dataIsOwned = TRUE;
    651     isWriteThroughAlias = FALSE;
    652 }
    655 void
    656 RuleBasedCollator::checkOwned() {
    657     if (!(dataIsOwned || isWriteThroughAlias)) {
    658         UErrorCode status = U_ZERO_ERROR;
    659         ucollator = ucol_safeClone(ucollator, NULL, NULL, &status);
    660         setRuleStringFromCollator();
    661         dataIsOwned = TRUE;
    662         isWriteThroughAlias = FALSE;
    663     }
    664 }
    667 int32_t RuleBasedCollator::internalGetShortDefinitionString(const char *locale,
    668                                                                       char *buffer,
    669                                                                       int32_t capacity,
    670                                                                       UErrorCode &status) const {
    671   /* simply delegate */
    672   return ucol_getShortDefinitionString(ucollator, locale, buffer, capacity, &status);
    673 }
    680 #endif /* #if !UCONFIG_NO_COLLATION */