Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 1996-2006, International Business Machines Corporation and    *
      4 * others. All Rights Reserved.                                                *
      5 *******************************************************************************
      6 */
      7 //===============================================================================
      8 //
      9 // File sortkey.cpp
     10 //
     11 //
     12 //
     13 // Created by: Helena Shih
     14 //
     15 // Modification History:
     16 //
     17 //  Date         Name          Description
     18 //
     19 //  6/20/97      helena        Java class name change.
     20 //  6/23/97      helena        Added comments to make code more readable.
     21 //  6/26/98      erm           Canged to use byte arrays instead of UnicodeString
     22 //  7/31/98      erm           hashCode: minimum inc should be 2 not 1,
     23 //                             Cleaned up operator=
     24 // 07/12/99      helena        HPUX 11 CC port.
     25 // 03/06/01      synwee        Modified compareTo, to handle the result of
     26 //                             2 string similar in contents, but one is longer
     27 //                             than the other
     28 //===============================================================================
     29 
     30 #include "unicode/utypes.h"
     31 
     32 #if !UCONFIG_NO_COLLATION
     33 
     34 #include "unicode/sortkey.h"
     35 #include "cmemory.h"
     36 #include "uhash.h"
     37 
     38 U_NAMESPACE_BEGIN
     39 
     40 // A hash code of kInvalidHashCode indicates that the has code needs
     41 // to be computed. A hash code of kEmptyHashCode is used for empty keys
     42 // and for any key whose computed hash code is kInvalidHashCode.
     43 #define kInvalidHashCode ((int32_t)0)
     44 #define kEmptyHashCode ((int32_t)1)
     45 
     46 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollationKey)
     47 
     48 CollationKey::CollationKey()
     49     : UObject(), fBogus(FALSE), fCount(0), fCapacity(0),
     50       fHashCode(kEmptyHashCode), fBytes(NULL)
     51 {
     52 }
     53 
     54 // Create a collation key from a bit array.
     55 CollationKey::CollationKey(const uint8_t* newValues, int32_t count)
     56     : UObject(), fBogus(FALSE), fCount(count), fCapacity(count),
     57       fHashCode(kInvalidHashCode)
     58 {
     59     fBytes = (uint8_t *)uprv_malloc(count);
     60 
     61     if (fBytes == NULL)
     62     {
     63         setToBogus();
     64         return;
     65     }
     66 
     67     uprv_memcpy(fBytes, newValues, fCount);
     68 }
     69 
     70 CollationKey::CollationKey(const CollationKey& other)
     71 : UObject(other), fBogus(FALSE), fCount(other.fCount), fCapacity(other.fCapacity),
     72   fHashCode(other.fHashCode), fBytes(NULL)
     73 {
     74     if (other.fBogus)
     75     {
     76         setToBogus();
     77         return;
     78     }
     79 
     80     fBytes = (uint8_t *)uprv_malloc(fCapacity);
     81 
     82     if (fBytes == NULL)
     83     {
     84         setToBogus();
     85         return;
     86     }
     87 
     88     uprv_memcpy(fBytes, other.fBytes, other.fCount);
     89     if(fCapacity>fCount) {
     90         uprv_memset(fBytes+fCount, 0, fCapacity-fCount);
     91     }
     92 }
     93 
     94 CollationKey::~CollationKey()
     95 {
     96         uprv_free(fBytes);
     97 }
     98 
     99 void CollationKey::adopt(uint8_t *values, int32_t count) {
    100     if(fBytes != NULL) {
    101         uprv_free(fBytes);
    102     }
    103     fBogus = FALSE;
    104     fBytes = values;
    105     fCount = count;
    106     fCapacity = count;
    107     fHashCode = kInvalidHashCode;
    108 }
    109 
    110 // set the key to an empty state
    111 CollationKey&
    112 CollationKey::reset()
    113 {
    114     fCount = 0;
    115     fBogus = FALSE;
    116     fHashCode = kEmptyHashCode;
    117 
    118     return *this;
    119 }
    120 
    121 // set the key to a "bogus" or invalid state
    122 CollationKey&
    123 CollationKey::setToBogus()
    124 {
    125     uprv_free(fBytes);
    126     fBytes = NULL;
    127 
    128     fCapacity = 0;
    129     fCount = 0;
    130     fHashCode = kInvalidHashCode;
    131 
    132     return *this;
    133 }
    134 
    135 UBool
    136 CollationKey::operator==(const CollationKey& source) const
    137 {
    138     return (this->fCount == source.fCount &&
    139             (this->fBytes == source.fBytes ||
    140              uprv_memcmp(this->fBytes, source.fBytes, this->fCount) == 0));
    141 }
    142 
    143 const CollationKey&
    144 CollationKey::operator=(const CollationKey& other)
    145 {
    146     if (this != &other)
    147     {
    148         if (other.isBogus())
    149         {
    150             return setToBogus();
    151         }
    152 
    153         if (other.fBytes != NULL)
    154         {
    155             ensureCapacity(other.fCount);
    156 
    157             if (isBogus())
    158             {
    159                 return *this;
    160             }
    161 
    162             fHashCode = other.fHashCode;
    163             uprv_memcpy(fBytes, other.fBytes, fCount);
    164         }
    165         else
    166         {
    167             fCount = 0;
    168             fBogus = FALSE;
    169             fHashCode = kEmptyHashCode;
    170         }
    171     }
    172 
    173     return *this;
    174 }
    175 
    176 // Bitwise comparison for the collation keys.
    177 // NOTE: this is somewhat messy 'cause we can't count
    178 // on memcmp returning the exact values which match
    179 // Collator::EComparisonResult
    180 Collator::EComparisonResult
    181 CollationKey::compareTo(const CollationKey& target) const
    182 {
    183     uint8_t *src = this->fBytes;
    184     uint8_t *tgt = target.fBytes;
    185 
    186     // are we comparing the same string
    187     if (src == tgt)
    188         return  Collator::EQUAL;
    189 
    190         /*
    191         int count = (this->fCount < target.fCount) ? this->fCount : target.fCount;
    192         if (count == 0)
    193         {
    194         // If count is 0, at least one of the keys is empty.
    195         // An empty key is always LESS than a non-empty one
    196         // and EQUAL to another empty
    197         if (this->fCount < target.fCount)
    198         {
    199         return Collator::LESS;
    200         }
    201 
    202           if (this->fCount > target.fCount)
    203           {
    204           return Collator::GREATER;
    205           }
    206           return Collator::EQUAL;
    207           }
    208     */
    209 
    210     int                         minLength;
    211     Collator::EComparisonResult result;
    212 
    213     // are we comparing different lengths?
    214     if (this->fCount != target.fCount) {
    215         if (this->fCount < target.fCount) {
    216             minLength = this->fCount;
    217             result    =  Collator::LESS;
    218         }
    219         else {
    220             minLength = target.fCount;
    221             result    =  Collator::GREATER;
    222         }
    223     }
    224     else {
    225         minLength = target.fCount;
    226         result    =  Collator::EQUAL;
    227     }
    228 
    229     if (minLength > 0) {
    230         int diff = uprv_memcmp(src, tgt, minLength);
    231         if (diff > 0) {
    232             return Collator::GREATER;
    233         }
    234         else
    235             if (diff < 0) {
    236                 return Collator::LESS;
    237             }
    238     }
    239 
    240     return result;
    241     /*
    242     if (result < 0)
    243     {
    244     return Collator::LESS;
    245     }
    246 
    247       if (result > 0)
    248       {
    249       return Collator::GREATER;
    250       }
    251       return Collator::EQUAL;
    252     */
    253 }
    254 
    255 // Bitwise comparison for the collation keys.
    256 UCollationResult
    257 CollationKey::compareTo(const CollationKey& target, UErrorCode &status) const
    258 {
    259   if(U_SUCCESS(status)) {
    260     uint8_t *src = this->fBytes;
    261     uint8_t *tgt = target.fBytes;
    262 
    263     // are we comparing the same string
    264     if (src == tgt)
    265         return  UCOL_EQUAL;
    266 
    267     int                         minLength;
    268     UCollationResult result;
    269 
    270     // are we comparing different lengths?
    271     if (this->fCount != target.fCount) {
    272         if (this->fCount < target.fCount) {
    273             minLength = this->fCount;
    274             result    =  UCOL_LESS;
    275         }
    276         else {
    277             minLength = target.fCount;
    278             result    =  UCOL_GREATER;
    279         }
    280     }
    281     else {
    282         minLength = target.fCount;
    283         result    =  UCOL_EQUAL;
    284     }
    285 
    286     if (minLength > 0) {
    287         int diff = uprv_memcmp(src, tgt, minLength);
    288         if (diff > 0) {
    289             return UCOL_GREATER;
    290         }
    291         else
    292             if (diff < 0) {
    293                 return UCOL_LESS;
    294             }
    295     }
    296 
    297     return result;
    298   } else {
    299     return UCOL_EQUAL;
    300   }
    301 }
    302 
    303 CollationKey&
    304 CollationKey::ensureCapacity(int32_t newSize)
    305 {
    306     if (fCapacity < newSize)
    307     {
    308         uprv_free(fBytes);
    309 
    310         fBytes = (uint8_t *)uprv_malloc(newSize);
    311 
    312         if (fBytes == NULL)
    313         {
    314             return setToBogus();
    315         }
    316 
    317         uprv_memset(fBytes, 0, fCapacity);
    318         fCapacity = newSize;
    319     }
    320 
    321     fBogus = FALSE;
    322     fCount = newSize;
    323     fHashCode = kInvalidHashCode;
    324 
    325     return *this;
    326 }
    327 
    328 #ifdef U_USE_COLLATION_KEY_DEPRECATES
    329 // Create a copy of the byte array.
    330 uint8_t*
    331 CollationKey::toByteArray(int32_t& count) const
    332 {
    333     uint8_t *result = (uint8_t*) uprv_malloc( sizeof(uint8_t) * fCount );
    334 
    335     if (result == NULL)
    336     {
    337         count = 0;
    338     }
    339     else
    340     {
    341         count = fCount;
    342         uprv_memcpy(result, fBytes, fCount);
    343     }
    344 
    345     return result;
    346 }
    347 #endif
    348 
    349 int32_t
    350 CollationKey::hashCode() const
    351 {
    352     // (Cribbed from UnicodeString)
    353     // We cache the hashCode; when it becomes invalid, due to any change to the
    354     // string, we note this by setting it to kInvalidHashCode. [LIU]
    355 
    356     // Note: This method is semantically const, but physically non-const.
    357 
    358     if (fHashCode == kInvalidHashCode)
    359     {
    360         UHashTok key;
    361         key.pointer = fBytes;
    362         ((CollationKey *)this)->fHashCode = uhash_hashChars(key);
    363 #if 0
    364         // We compute the hash by iterating sparsely over 64 (at most) characters
    365         // spaced evenly through the string.  For each character, we multiply the
    366         // previous hash value by a prime number and add the new character in,
    367         // in the manner of a additive linear congruential random number generator,
    368         // thus producing a pseudorandom deterministic value which should be well
    369         // distributed over the output range. [LIU]
    370         const uint8_t   *p = fBytes, *limit = fBytes + fCount;
    371         int32_t         inc = (fCount >= 256) ? fCount/128 : 2; // inc = max(fSize/64, 1);
    372         int32_t         hash = 0;
    373 
    374         while (p < limit)
    375         {
    376             hash = ( hash * 37 ) + ((p[0] << 8) + p[1]);
    377             p += inc;
    378         }
    379 
    380         // If we happened to get kInvalidHashCode, replace it with kEmptyHashCode
    381         if (hash == kInvalidHashCode)
    382         {
    383             hash = kEmptyHashCode;
    384         }
    385 
    386         ((CollationKey *)this)->fHashCode = hash; // cast away const
    387 #endif
    388     }
    389 
    390     return fHashCode;
    391 }
    392 
    393 U_NAMESPACE_END
    394 
    395 U_CAPI int32_t U_EXPORT2
    396 ucol_keyHashCode(const uint8_t *key,
    397                        int32_t  length)
    398 {
    399     U_NAMESPACE_QUALIFIER CollationKey newKey(key, length);
    400     return newKey.hashCode();
    401 }
    402 
    403 #endif /* #if !UCONFIG_NO_COLLATION */
    404