Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2009, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucln_in.h"
     30 #include "ucol_imp.h"
     31 #include "umutex.h"
     32 
     33 #include "unicode/colldata.h"
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     41 
     42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
     43 
     44 #ifdef INSTRUMENT_CELIST
     45 int32_t CEList::_active = 0;
     46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     47 #endif
     48 
     49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     50     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     51 {
     52     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     53     UCollationStrength strength = ucol_getStrength(coll);
     54     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     55     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     56     uint32_t strengthMask = 0;
     57     int32_t order;
     58 
     59     if (U_FAILURE(status)) {
     60         return;
     61     }
     62 
     63     // **** only set flag if string has Han(gul) ****
     64     ucol_forceHanImplicit(elems, &status);
     65 
     66     switch (strength)
     67     {
     68     default:
     69         strengthMask |= UCOL_TERTIARYORDERMASK;
     70         /* fall through */
     71 
     72     case UCOL_SECONDARY:
     73         strengthMask |= UCOL_SECONDARYORDERMASK;
     74         /* fall through */
     75 
     76     case UCOL_PRIMARY:
     77         strengthMask |= UCOL_PRIMARYORDERMASK;
     78     }
     79 
     80 #ifdef INSTRUMENT_CELIST
     81     _active += 1;
     82     _histogram[0] += 1;
     83 #endif
     84 
     85     ces = ceBuffer;
     86 
     87     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     88         UBool cont = isContinuation(order);
     89 
     90         order &= strengthMask;
     91 
     92         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     93             if (strength >= UCOL_QUATERNARY) {
     94                 order &= UCOL_PRIMARYORDERMASK;
     95             } else {
     96                 order = UCOL_IGNORABLE;
     97             }
     98         }
     99 
    100         if (order == UCOL_IGNORABLE) {
    101             continue;
    102         }
    103 
    104         if (cont) {
    105             order |= UCOL_CONTINUATION_MARKER;
    106         }
    107 
    108         add(order, status);
    109     }
    110 
    111     ucol_closeElements(elems);
    112 }
    113 
    114 CEList::~CEList()
    115 {
    116 #ifdef INSTRUMENT_CELIST
    117     _active -= 1;
    118 #endif
    119 
    120     if (ces != ceBuffer) {
    121         DELETE_ARRAY(ces);
    122     }
    123 }
    124 
    125 void CEList::add(uint32_t ce, UErrorCode &status)
    126 {
    127     if (U_FAILURE(status)) {
    128         return;
    129     }
    130 
    131     if (listSize >= listMax) {
    132         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    133 
    134 #ifdef INSTRUMENT_CELIST
    135         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
    136 #endif
    137 
    138         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    139 
    140         if (newCEs == NULL) {
    141             status = U_MEMORY_ALLOCATION_ERROR;
    142             return;
    143         }
    144 
    145         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    146 
    147         if (ces != ceBuffer) {
    148             DELETE_ARRAY(ces);
    149         }
    150 
    151         ces = newCEs;
    152         listMax = newMax;
    153     }
    154 
    155     ces[listSize++] = ce;
    156 }
    157 
    158 uint32_t CEList::get(int32_t index) const
    159 {
    160     if (index >= 0 && index < listSize) {
    161         return ces[index];
    162     }
    163 
    164     return UCOL_NULLORDER;
    165 }
    166 
    167 uint32_t &CEList::operator[](int32_t index) const
    168 {
    169     return ces[index];
    170 }
    171 
    172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    173 {
    174     if (other == NULL || listSize - offset < other->size()) {
    175         return FALSE;
    176     }
    177 
    178     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    179         if (ces[i] != (*other)[j]) {
    180             return FALSE;
    181         }
    182     }
    183 
    184     return TRUE;
    185 }
    186 
    187 int32_t CEList::size() const
    188 {
    189     return listSize;
    190 }
    191 
    192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
    193 
    194 #ifdef INSTRUMENT_STRING_LIST
    195 int32_t StringList::_lists = 0;
    196 int32_t StringList::_strings = 0;
    197 int32_t StringList::_histogram[101] = {0};
    198 #endif
    199 
    200 StringList::StringList(UErrorCode &status)
    201     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    202 {
    203     if (U_FAILURE(status)) {
    204         return;
    205     }
    206 
    207     strings = new UnicodeString [listMax];
    208 
    209     if (strings == NULL) {
    210         status = U_MEMORY_ALLOCATION_ERROR;
    211         return;
    212     }
    213 
    214 #ifdef INSTRUMENT_STRING_LIST
    215     _lists += 1;
    216     _histogram[0] += 1;
    217 #endif
    218 }
    219 
    220 StringList::~StringList()
    221 {
    222     delete[] strings;
    223 }
    224 
    225 void StringList::add(const UnicodeString *string, UErrorCode &status)
    226 {
    227     if (U_FAILURE(status)) {
    228         return;
    229     }
    230 
    231 #ifdef INSTRUMENT_STRING_LIST
    232     _strings += 1;
    233 #endif
    234 
    235     if (listSize >= listMax) {
    236         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    237 
    238         UnicodeString *newStrings = new UnicodeString[newMax];
    239         if (newStrings == NULL) {
    240             status = U_MEMORY_ALLOCATION_ERROR;
    241             return;
    242         }
    243         for (int32_t i=0; i<listSize; ++i) {
    244             newStrings[i] = strings[i];
    245         }
    246 
    247 #ifdef INSTRUMENT_STRING_LIST
    248         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
    249 
    250         if (_h > 100) {
    251             _h = 100;
    252         }
    253 
    254         _histogram[_h] += 1;
    255 #endif
    256 
    257         delete[] strings;
    258         strings = newStrings;
    259         listMax = newMax;
    260     }
    261 
    262     // The ctor initialized all the strings in
    263     // the array to empty strings, so this
    264     // is the same as copying the source string.
    265     strings[listSize++].append(*string);
    266 }
    267 
    268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    269 {
    270     const UnicodeString string(chars, count);
    271 
    272     add(&string, status);
    273 }
    274 
    275 const UnicodeString *StringList::get(int32_t index) const
    276 {
    277     if (index >= 0 && index < listSize) {
    278         return &strings[index];
    279     }
    280 
    281     return NULL;
    282 }
    283 
    284 int32_t StringList::size() const
    285 {
    286     return listSize;
    287 }
    288 
    289 
    290 U_CFUNC void deleteStringList(void *obj);
    291 
    292 class CEToStringsMap : public UMemory
    293 {
    294 public:
    295 
    296     CEToStringsMap(UErrorCode &status);
    297     ~CEToStringsMap();
    298 
    299     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    300     StringList *getStringList(uint32_t ce) const;
    301 
    302 private:
    303 
    304     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    305     UHashtable *map;
    306 };
    307 
    308 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    309     : map(NULL)
    310 {
    311     if (U_FAILURE(status)) {
    312         return;
    313     }
    314 
    315     map = uhash_open(uhash_hashLong, uhash_compareLong,
    316                      uhash_compareCaselessUnicodeString,
    317                      &status);
    318 
    319     if (U_FAILURE(status)) {
    320         return;
    321     }
    322 
    323     uhash_setValueDeleter(map, deleteStringList);
    324 }
    325 
    326 CEToStringsMap::~CEToStringsMap()
    327 {
    328     uhash_close(map);
    329 }
    330 
    331 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    332 {
    333     StringList *strings = getStringList(ce);
    334 
    335     if (strings == NULL) {
    336         strings = new StringList(status);
    337 
    338         if (strings == NULL || U_FAILURE(status)) {
    339             status = U_MEMORY_ALLOCATION_ERROR;
    340             return;
    341         }
    342 
    343         putStringList(ce, strings, status);
    344     }
    345 
    346     strings->add(string, status);
    347 }
    348 
    349 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    350 {
    351     return (StringList *) uhash_iget(map, ce);
    352 }
    353 
    354 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    355 {
    356     uhash_iput(map, ce, (void *) stringList, &status);
    357 }
    358 
    359 U_CFUNC void deleteStringList(void *obj)
    360 {
    361     StringList *strings = (StringList *) obj;
    362 
    363     delete strings;
    364 }
    365 
    366 U_CFUNC void deleteCEList(void *obj);
    367 U_CFUNC void deleteUnicodeStringKey(void *obj);
    368 
    369 class StringToCEsMap : public UMemory
    370 {
    371 public:
    372     StringToCEsMap(UErrorCode &status);
    373     ~StringToCEsMap();
    374 
    375     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
    376     const CEList *get(const UnicodeString *string);
    377     void free(const CEList *list);
    378 
    379 private:
    380 
    381 
    382     UHashtable *map;
    383 };
    384 
    385 StringToCEsMap::StringToCEsMap(UErrorCode &status)
    386     : map(NULL)
    387 {
    388     if (U_FAILURE(status)) {
    389         return;
    390     }
    391 
    392     map = uhash_open(uhash_hashUnicodeString,
    393                      uhash_compareUnicodeString,
    394                      uhash_compareLong,
    395                      &status);
    396 
    397     if (U_FAILURE(status)) {
    398         return;
    399     }
    400 
    401     uhash_setValueDeleter(map, deleteCEList);
    402     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
    403 }
    404 
    405 StringToCEsMap::~StringToCEsMap()
    406 {
    407     uhash_close(map);
    408 }
    409 
    410 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
    411 {
    412     uhash_put(map, (void *) string, (void *) ces, &status);
    413 }
    414 
    415 const CEList *StringToCEsMap::get(const UnicodeString *string)
    416 {
    417     return (const CEList *) uhash_get(map, string);
    418 }
    419 
    420 U_CFUNC void deleteCEList(void *obj)
    421 {
    422     CEList *list = (CEList *) obj;
    423 
    424     delete list;
    425 }
    426 
    427 U_CFUNC void deleteUnicodeStringKey(void *obj)
    428 {
    429     UnicodeString *key = (UnicodeString *) obj;
    430 
    431     delete key;
    432 }
    433 
    434 class CollDataCacheEntry : public UMemory
    435 {
    436 public:
    437     CollDataCacheEntry(CollData *theData);
    438     ~CollDataCacheEntry();
    439 
    440     CollData *data;
    441     int32_t   refCount;
    442 };
    443 
    444 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
    445     : data(theData), refCount(1)
    446 {
    447     // nothing else to do
    448 }
    449 
    450 CollDataCacheEntry::~CollDataCacheEntry()
    451 {
    452     // check refCount?
    453     delete data;
    454 }
    455 
    456 class CollDataCache : public UMemory
    457 {
    458 public:
    459     CollDataCache(UErrorCode &status);
    460     ~CollDataCache();
    461 
    462     CollData *get(UCollator *collator, UErrorCode &status);
    463     void unref(CollData *collData);
    464 
    465     void flush();
    466 
    467 private:
    468     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
    469     static void deleteKey(char *key);
    470 
    471     UMTX lock;
    472     UHashtable *cache;
    473 };
    474 
    475 U_CFUNC void deleteChars(void * /*obj*/)
    476 {
    477     // char *chars = (char *) obj;
    478     // All the key strings are owned by the
    479     // CollData objects and don't need to
    480     // be freed here.
    481   //DELETE_ARRAY(chars);
    482 }
    483 
    484 U_CFUNC void deleteCollDataCacheEntry(void *obj)
    485 {
    486     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
    487 
    488     delete entry;
    489 }
    490 
    491 CollDataCache::CollDataCache(UErrorCode &status)
    492     : lock(0), cache(NULL)
    493 {
    494     if (U_FAILURE(status)) {
    495         return;
    496     }
    497 
    498     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
    499 
    500     if (U_FAILURE(status)) {
    501         return;
    502     }
    503 
    504     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
    505     uhash_setKeyDeleter(cache, deleteChars);
    506 }
    507 
    508 CollDataCache::~CollDataCache()
    509 {
    510     umtx_lock(&lock);
    511     uhash_close(cache);
    512     cache = NULL;
    513     umtx_unlock(&lock);
    514 
    515     umtx_destroy(&lock);
    516 }
    517 
    518 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
    519 {
    520     char keyBuffer[KEY_BUFFER_SIZE];
    521     int32_t keyLength = KEY_BUFFER_SIZE;
    522     char *key = getKey(collator, keyBuffer, &keyLength);
    523     CollData *result = NULL, *newData = NULL;
    524     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
    525 
    526     umtx_lock(&lock);
    527     entry = (CollDataCacheEntry *) uhash_get(cache, key);
    528 
    529     if (entry == NULL) {
    530         umtx_unlock(&lock);
    531 
    532         newData = new CollData(collator, key, keyLength, status);
    533         newEntry = new CollDataCacheEntry(newData);
    534 
    535         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
    536             status = U_MEMORY_ALLOCATION_ERROR;
    537             return NULL;
    538         }
    539 
    540         umtx_lock(&lock);
    541         entry = (CollDataCacheEntry *) uhash_get(cache, key);
    542 
    543         if (entry == NULL) {
    544             uhash_put(cache, newData->key, newEntry, &status);
    545             umtx_unlock(&lock);
    546 
    547             if (U_FAILURE(status)) {
    548                 delete newEntry;
    549                 delete newData;
    550 
    551                 return NULL;
    552             }
    553 
    554             return newData;
    555         }
    556     }
    557 
    558     result = entry->data;
    559     entry->refCount += 1;
    560     umtx_unlock(&lock);
    561 
    562     if (key != keyBuffer) {
    563         deleteKey(key);
    564     }
    565 
    566     if (newEntry != NULL) {
    567         delete newEntry;
    568         delete newData;
    569     }
    570 
    571     return result;
    572 }
    573 
    574 void CollDataCache::unref(CollData *collData)
    575 {
    576     CollDataCacheEntry *entry = NULL;
    577 
    578     umtx_lock(&lock);
    579     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
    580 
    581     if (entry != NULL) {
    582         entry->refCount -= 1;
    583     }
    584     umtx_unlock(&lock);
    585 }
    586 
    587 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
    588 {
    589     UErrorCode status = U_ZERO_ERROR;
    590     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    591 
    592     if (len >= *keyBufferLength) {
    593         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
    594         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
    595         status = U_ZERO_ERROR;
    596 
    597         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    598     }
    599 
    600     keyBuffer[len] = '\0';
    601 
    602     return keyBuffer;
    603 }
    604 
    605 void CollDataCache::flush()
    606 {
    607     const UHashElement *element;
    608     int32_t pos = -1;
    609 
    610     umtx_lock(&lock);
    611     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
    612         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
    613 
    614         if (entry->refCount <= 0) {
    615             uhash_removeElement(cache, element);
    616         }
    617     }
    618     umtx_unlock(&lock);
    619 }
    620 
    621 void CollDataCache::deleteKey(char *key)
    622 {
    623     DELETE_ARRAY(key);
    624 }
    625 
    626 U_CDECL_BEGIN
    627 static UBool coll_data_cleanup(void) {
    628     CollData::freeCollDataCache();
    629   return TRUE;
    630 }
    631 U_CDECL_END
    632 
    633 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
    634 
    635 CollData::CollData()
    636 {
    637     // nothing
    638 }
    639 
    640 #define CLONE_COLLATOR
    641 
    642 //#define CACHE_CELISTS
    643 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
    644     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
    645 {
    646     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    647     // i.e. other, control, private use, format, surrogate
    648     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    649     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    650     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    651 
    652     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    653     // i.e. all the characers we handle implicitly
    654     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    655     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    656     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    657 
    658     if (U_FAILURE(status)) {
    659         return;
    660     }
    661 
    662     USet *expansions   = uset_openEmpty();
    663     USet *contractions = uset_openEmpty();
    664     int32_t itemCount;
    665 
    666 #ifdef CACHE_CELISTS
    667     charsToCEList = new StringToCEsMap(status);
    668 
    669     if (U_FAILURE(status)) {
    670         goto bail;
    671     }
    672 #else
    673     charsToCEList = NULL;
    674 #endif
    675 
    676     ceToCharsStartingWith = new CEToStringsMap(status);
    677 
    678     if (U_FAILURE(status)) {
    679         goto bail;
    680     }
    681 
    682     if (cacheKeyLength > KEY_BUFFER_SIZE) {
    683         key = NEW_ARRAY(char, cacheKeyLength);
    684 
    685         if (key == NULL) {
    686             status = U_MEMORY_ALLOCATION_ERROR;
    687             goto bail;
    688         }
    689     } else {
    690         key = keyBuffer;
    691     }
    692 
    693     ARRAY_COPY(key, cacheKey, cacheKeyLength);
    694 
    695 #ifdef CLONE_COLLATOR
    696     coll = ucol_safeClone(collator, NULL, NULL, &status);
    697 
    698     if (U_FAILURE(status)) {
    699         goto bail;
    700     }
    701 #else
    702     coll = collator;
    703 #endif
    704 
    705     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    706 
    707     uset_addAll(charsToTest, contractions);
    708     uset_addAll(charsToTest, expansions);
    709     uset_removeAll(charsToTest, charsToRemove);
    710 
    711     itemCount = uset_getItemCount(charsToTest);
    712     for(int32_t item = 0; item < itemCount; item += 1) {
    713         UChar32 start = 0, end = 0;
    714         UChar buffer[16];
    715         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    716                                    buffer, 16, &status);
    717 
    718         if (len == 0) {
    719             for (UChar32 ch = start; ch <= end; ch += 1) {
    720                 UnicodeString *st = new UnicodeString(ch);
    721 
    722                 if (st == NULL) {
    723                     status = U_MEMORY_ALLOCATION_ERROR;
    724                     break;
    725                 }
    726 
    727                 CEList *ceList = new CEList(coll, *st, status);
    728 
    729                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    730 
    731 #ifdef CACHE_CELISTS
    732                 charsToCEList->put(st, ceList, status);
    733 #else
    734                 delete ceList;
    735                 delete st;
    736 #endif
    737             }
    738         } else if (len > 0) {
    739             UnicodeString *st = new UnicodeString(buffer, len);
    740 
    741             if (st == NULL) {
    742                 status = U_MEMORY_ALLOCATION_ERROR;
    743                 break;
    744             }
    745 
    746             CEList *ceList = new CEList(coll, *st, status);
    747 
    748             ceToCharsStartingWith->put(ceList->get(0), st, status);
    749 
    750 #ifdef CACHE_CELISTS
    751             charsToCEList->put(st, ceList, status);
    752 #else
    753             delete ceList;
    754             delete st;
    755 #endif
    756         } else {
    757             // shouldn't happen...
    758         }
    759 
    760         if (U_FAILURE(status)) {
    761              break;
    762         }
    763     }
    764 
    765 bail:
    766     uset_close(contractions);
    767     uset_close(expansions);
    768     uset_close(charsToRemove);
    769     uset_close(charsToTest);
    770 
    771     if (U_FAILURE(status)) {
    772         return;
    773     }
    774 
    775      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
    776                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
    777      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
    778      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
    779      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    780      CEList hanList(coll, hanString, status);
    781      CEList jamoList(coll, jamoString, status);
    782      int32_t j = 0;
    783 
    784      if (U_FAILURE(status)) {
    785          return;
    786      }
    787 
    788      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    789          uint32_t jce = jamoList[c];
    790 
    791          if (! isContinuation(jce)) {
    792              jamoLimits[j++] = jce;
    793          }
    794      }
    795 
    796      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    797 
    798      minHan = 0xFFFFFFFF;
    799      maxHan = 0;
    800 
    801      for(int32_t h = 0; h < hanList.size(); h += 2) {
    802          uint32_t han = (uint32_t) hanList[h];
    803 
    804          if (han < minHan) {
    805              minHan = han;
    806          }
    807 
    808          if (han > maxHan) {
    809              maxHan = han;
    810          }
    811      }
    812 
    813      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    814 }
    815 
    816 CollData::~CollData()
    817 {
    818 #ifdef CLONE_COLLATOR
    819    ucol_close(coll);
    820 #endif
    821 
    822    if (key != keyBuffer) {
    823        DELETE_ARRAY(key);
    824    }
    825 
    826    delete ceToCharsStartingWith;
    827 
    828 #ifdef CACHE_CELISTS
    829    delete charsToCEList;
    830 #endif
    831 }
    832 
    833 UCollator *CollData::getCollator() const
    834 {
    835     return coll;
    836 }
    837 
    838 const StringList *CollData::getStringList(int32_t ce) const
    839 {
    840     return ceToCharsStartingWith->getStringList(ce);
    841 }
    842 
    843 const CEList *CollData::getCEList(const UnicodeString *string) const
    844 {
    845 #ifdef CACHE_CELISTS
    846     return charsToCEList->get(string);
    847 #else
    848     UErrorCode status = U_ZERO_ERROR;
    849     const CEList *list = new CEList(coll, *string, status);
    850 
    851     if (U_FAILURE(status)) {
    852         delete list;
    853         list = NULL;
    854     }
    855 
    856     return list;
    857 #endif
    858 }
    859 
    860 void CollData::freeCEList(const CEList *list)
    861 {
    862 #ifndef CACHE_CELISTS
    863     delete list;
    864 #endif
    865 }
    866 
    867 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    868 {
    869     // find out shortest string for the longest sequence of ces.
    870     // this can probably be folded with the minLengthCache...
    871 
    872     if (history[offset] >= 0) {
    873         return history[offset];
    874     }
    875 
    876     uint32_t ce = ceList->get(offset);
    877     int32_t maxOffset = ceList->size();
    878     int32_t shortestLength = INT32_MAX;
    879     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    880 
    881     if (strings != NULL) {
    882         int32_t stringCount = strings->size();
    883 
    884         for (int32_t s = 0; s < stringCount; s += 1) {
    885             const UnicodeString *string = strings->get(s);
    886 #ifdef CACHE_CELISTS
    887             const CEList *ceList2 = charsToCEList->get(string);
    888 #else
    889             UErrorCode status = U_ZERO_ERROR;
    890             const CEList *ceList2 = new CEList(coll, *string, status);
    891 
    892             if (U_FAILURE(status)) {
    893                 delete ceList2;
    894                 ceList2 = NULL;
    895             }
    896 #endif
    897 
    898             if (ceList->matchesAt(offset, ceList2)) {
    899                 int32_t clength = ceList2->size();
    900                 int32_t slength = string->length();
    901                 int32_t roffset = offset + clength;
    902                 int32_t rlength = 0;
    903 
    904                 if (roffset < maxOffset) {
    905                     rlength = minLengthInChars(ceList, roffset, history);
    906 
    907                     if (rlength <= 0) {
    908                     // delete before continue to avoid memory leak.
    909 #ifndef CACHE_CELISTS
    910                         delete ceList2;
    911 #endif
    912                         // ignore any dead ends
    913                         continue;
    914                     }
    915                 }
    916 
    917                 if (shortestLength > slength + rlength) {
    918                     shortestLength = slength + rlength;
    919                 }
    920             }
    921 
    922 #ifndef CACHE_CELISTS
    923             delete ceList2;
    924 #endif
    925         }
    926     }
    927 
    928     if (shortestLength == INT32_MAX) {
    929         // No matching strings at this offset. See if
    930         // the CE is in a range we can handle manually.
    931         if (ce >= minHan && ce < maxHan) {
    932             // all han have implicit orders which
    933             // generate two CEs.
    934             int32_t roffset = offset + 2;
    935             int32_t rlength = 0;
    936 
    937           //history[roffset++] = -1;
    938           //history[roffset++] = 1;
    939 
    940             if (roffset < maxOffset) {
    941                 rlength = minLengthInChars(ceList, roffset, history);
    942             }
    943 
    944             if (rlength < 0) {
    945                 return -1;
    946             }
    947 
    948             shortestLength = 1 + rlength;
    949             goto have_shortest;
    950         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    951             int32_t roffset = offset;
    952             int32_t rlength = 0;
    953 
    954             // **** this loop may not handle archaic Hangul correctly ****
    955             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    956                 uint32_t jce = ceList->get(roffset);
    957 
    958                 // Some Jamo have 24-bit primary order; skip the
    959                 // 2nd CE. This should always be OK because if
    960                 // we're still in the loop all we've seen are
    961                 // a series of Jamo in LVT order.
    962                 if (isContinuation(jce)) {
    963                     continue;
    964                 }
    965 
    966                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    967                     break;
    968                 }
    969             }
    970 
    971             if (roffset == offset) {
    972                 // we started with a non-L Jamo...
    973                 // just say it comes from a single character
    974                 roffset += 1;
    975 
    976                 // See if the single Jamo has a 24-bit order.
    977                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    978                     roffset += 1;
    979                 }
    980             }
    981 
    982             if (roffset < maxOffset) {
    983                 rlength = minLengthInChars(ceList, roffset, history);
    984             }
    985 
    986             if (rlength < 0) {
    987                 return -1;
    988             }
    989 
    990             shortestLength = 1 + rlength;
    991             goto have_shortest;
    992         }
    993 
    994         // Can't handle it manually either. Just move on.
    995         return -1;
    996     }
    997 
    998 have_shortest:
    999     history[offset] = shortestLength;
   1000 
   1001     return shortestLength;
   1002 }
   1003 
   1004 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
   1005 {
   1006     int32_t clength = ceList->size();
   1007     int32_t *history = NEW_ARRAY(int32_t, clength);
   1008 
   1009     for (int32_t i = 0; i < clength; i += 1) {
   1010         history[i] = -1;
   1011     }
   1012 
   1013     int32_t minLength = minLengthInChars(ceList, offset, history);
   1014 
   1015     DELETE_ARRAY(history);
   1016 
   1017     return minLength;
   1018 }
   1019 
   1020 CollData *CollData::open(UCollator *collator, UErrorCode &status)
   1021 {
   1022     if (U_FAILURE(status)) {
   1023         return NULL;
   1024     }
   1025 
   1026     CollDataCache *cache = getCollDataCache();
   1027 
   1028     return cache->get(collator, status);
   1029 }
   1030 
   1031 void CollData::close(CollData *collData)
   1032 {
   1033     CollDataCache *cache = getCollDataCache();
   1034 
   1035     cache->unref(collData);
   1036 }
   1037 
   1038 CollDataCache *CollData::collDataCache = NULL;
   1039 
   1040 CollDataCache *CollData::getCollDataCache()
   1041 {
   1042     UErrorCode status = U_ZERO_ERROR;
   1043     CollDataCache *cache = NULL;
   1044 
   1045     UMTX_CHECK(NULL, collDataCache, cache);
   1046 
   1047     if (cache == NULL) {
   1048         cache = new CollDataCache(status);
   1049 
   1050         if (U_FAILURE(status)) {
   1051             delete cache;
   1052             return NULL;
   1053         }
   1054 
   1055         umtx_lock(NULL);
   1056         if (collDataCache == NULL) {
   1057             collDataCache = cache;
   1058 
   1059             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
   1060         }
   1061         umtx_unlock(NULL);
   1062 
   1063         if (collDataCache != cache) {
   1064             delete cache;
   1065         }
   1066     }
   1067 
   1068     return collDataCache;
   1069 }
   1070 
   1071 void CollData::freeCollDataCache()
   1072 {
   1073     CollDataCache *cache = NULL;
   1074 
   1075     UMTX_CHECK(NULL, collDataCache, cache);
   1076 
   1077     if (cache != NULL) {
   1078         umtx_lock(NULL);
   1079         if (collDataCache != NULL) {
   1080             collDataCache = NULL;
   1081         } else {
   1082             cache = NULL;
   1083         }
   1084         umtx_unlock(NULL);
   1085 
   1086         delete cache;
   1087     }
   1088 }
   1089 
   1090 void CollData::flushCollDataCache()
   1091 {
   1092     CollDataCache *cache = NULL;
   1093 
   1094     UMTX_CHECK(NULL, collDataCache, cache);
   1095 
   1096     // **** this will fail if the another ****
   1097     // **** thread deletes the cache here ****
   1098     if (cache != NULL) {
   1099         cache->flush();
   1100     }
   1101 }
   1102 
   1103 U_NAMESPACE_END
   1104 
   1105 #endif // #if !UCONFIG_NO_COLLATION
   1106