Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2012, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucln_in.h"
     30 #include "ucol_imp.h"
     31 #include "umutex.h"
     32 #include "uassert.h"
     33 
     34 #include "unicode/colldata.h"
     35 
     36 U_NAMESPACE_BEGIN
     37 
     38 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     39 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     40 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     42 
     43 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
     44 
     45 #ifdef INSTRUMENT_CELIST
     46 int32_t CEList::_active = 0;
     47 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     48 #endif
     49 
     50 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     51     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     52 {
     53     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     54     UCollationStrength strength = ucol_getStrength(coll);
     55     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     56     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     57     uint32_t strengthMask = 0;
     58     int32_t order;
     59 
     60     if (U_FAILURE(status)) {
     61         return;
     62     }
     63 
     64     // **** only set flag if string has Han(gul) ****
     65     ucol_forceHanImplicit(elems, &status);
     66 
     67     switch (strength)
     68     {
     69     default:
     70         strengthMask |= UCOL_TERTIARYORDERMASK;
     71         /* fall through */
     72 
     73     case UCOL_SECONDARY:
     74         strengthMask |= UCOL_SECONDARYORDERMASK;
     75         /* fall through */
     76 
     77     case UCOL_PRIMARY:
     78         strengthMask |= UCOL_PRIMARYORDERMASK;
     79     }
     80 
     81 #ifdef INSTRUMENT_CELIST
     82     _active += 1;
     83     _histogram[0] += 1;
     84 #endif
     85 
     86     ces = ceBuffer;
     87 
     88     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     89         UBool cont = isContinuation(order);
     90 
     91         order &= strengthMask;
     92 
     93         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     94             if (strength >= UCOL_QUATERNARY) {
     95                 order &= UCOL_PRIMARYORDERMASK;
     96             } else {
     97                 order = UCOL_IGNORABLE;
     98             }
     99         }
    100 
    101         if (order == UCOL_IGNORABLE) {
    102             continue;
    103         }
    104 
    105         if (cont) {
    106             order |= UCOL_CONTINUATION_MARKER;
    107         }
    108 
    109         add(order, status);
    110     }
    111 
    112     ucol_closeElements(elems);
    113 }
    114 
    115 CEList::~CEList()
    116 {
    117 #ifdef INSTRUMENT_CELIST
    118     _active -= 1;
    119 #endif
    120 
    121     if (ces != ceBuffer) {
    122         DELETE_ARRAY(ces);
    123     }
    124 }
    125 
    126 void CEList::add(uint32_t ce, UErrorCode &status)
    127 {
    128     if (U_FAILURE(status)) {
    129         return;
    130     }
    131 
    132     if (listSize >= listMax) {
    133         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    134 
    135 #ifdef INSTRUMENT_CELIST
    136         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
    137 #endif
    138 
    139         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    140 
    141         if (newCEs == NULL) {
    142             status = U_MEMORY_ALLOCATION_ERROR;
    143             return;
    144         }
    145 
    146         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    147 
    148         if (ces != ceBuffer) {
    149             DELETE_ARRAY(ces);
    150         }
    151 
    152         ces = newCEs;
    153         listMax = newMax;
    154     }
    155 
    156     ces[listSize++] = ce;
    157 }
    158 
    159 uint32_t CEList::get(int32_t index) const
    160 {
    161     if (index >= 0 && index < listSize) {
    162         return ces[index];
    163     }
    164 
    165     return (uint32_t)UCOL_NULLORDER;
    166 }
    167 
    168 uint32_t &CEList::operator[](int32_t index) const
    169 {
    170     return ces[index];
    171 }
    172 
    173 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    174 {
    175     if (other == NULL || listSize - offset < other->size()) {
    176         return FALSE;
    177     }
    178 
    179     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    180         if (ces[i] != (*other)[j]) {
    181             return FALSE;
    182         }
    183     }
    184 
    185     return TRUE;
    186 }
    187 
    188 int32_t CEList::size() const
    189 {
    190     return listSize;
    191 }
    192 
    193 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
    194 
    195 #ifdef INSTRUMENT_STRING_LIST
    196 int32_t StringList::_lists = 0;
    197 int32_t StringList::_strings = 0;
    198 int32_t StringList::_histogram[101] = {0};
    199 #endif
    200 
    201 StringList::StringList(UErrorCode &status)
    202     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    203 {
    204     if (U_FAILURE(status)) {
    205         return;
    206     }
    207 
    208     strings = new UnicodeString [listMax];
    209 
    210     if (strings == NULL) {
    211         status = U_MEMORY_ALLOCATION_ERROR;
    212         return;
    213     }
    214 
    215 #ifdef INSTRUMENT_STRING_LIST
    216     _lists += 1;
    217     _histogram[0] += 1;
    218 #endif
    219 }
    220 
    221 StringList::~StringList()
    222 {
    223     delete[] strings;
    224 }
    225 
    226 void StringList::add(const UnicodeString *string, UErrorCode &status)
    227 {
    228     if (U_FAILURE(status)) {
    229         return;
    230     }
    231 
    232 #ifdef INSTRUMENT_STRING_LIST
    233     _strings += 1;
    234 #endif
    235 
    236     if (listSize >= listMax) {
    237         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    238         UnicodeString *newStrings = new UnicodeString[newMax];
    239         if (newStrings == NULL) {
    240             status = U_MEMORY_ALLOCATION_ERROR;
    241             return;
    242         }
    243         for (int32_t i=0; i<listSize; ++i) {
    244             newStrings[i] = strings[i];
    245         }
    246 
    247 #ifdef INSTRUMENT_STRING_LIST
    248         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
    249 
    250         if (_h > 100) {
    251             _h = 100;
    252         }
    253 
    254         _histogram[_h] += 1;
    255 #endif
    256 
    257         delete[] strings;
    258         strings = newStrings;
    259         listMax = newMax;
    260     }
    261 
    262     // The ctor initialized all the strings in
    263     // the array to empty strings, so this
    264     // is the same as copying the source string.
    265     strings[listSize++].append(*string);
    266 }
    267 
    268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    269 {
    270     const UnicodeString string(chars, count);
    271 
    272     add(&string, status);
    273 }
    274 
    275 const UnicodeString *StringList::get(int32_t index) const
    276 {
    277     if (index >= 0 && index < listSize) {
    278         return &strings[index];
    279     }
    280 
    281     return NULL;
    282 }
    283 
    284 int32_t StringList::size() const
    285 {
    286     return listSize;
    287 }
    288 
    289 
    290 U_CDECL_BEGIN
    291 static void U_CALLCONV
    292 deleteStringList(void *obj)
    293 {
    294     StringList *strings = (StringList *) obj;
    295 
    296     delete strings;
    297 }
    298 static void U_CALLCONV
    299 deleteCEList(void *obj)
    300 {
    301     CEList *list = (CEList *) obj;
    302 
    303     delete list;
    304 }
    305 
    306 static void U_CALLCONV
    307 deleteUnicodeStringKey(void *obj)
    308 {
    309     UnicodeString *key = (UnicodeString *) obj;
    310 
    311     delete key;
    312 }
    313 
    314 static void U_CALLCONV
    315 deleteChars(void * /*obj*/)
    316 {
    317     // char *chars = (char *) obj;
    318     // All the key strings are owned by the
    319     // CollData objects and don't need to
    320     // be freed here.
    321   //DELETE_ARRAY(chars);
    322 }
    323 
    324 U_CDECL_END
    325 
    326 class CEToStringsMap : public UMemory
    327 {
    328 public:
    329 
    330     CEToStringsMap(UErrorCode &status);
    331     ~CEToStringsMap();
    332 
    333     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    334     StringList *getStringList(uint32_t ce) const;
    335 
    336 private:
    337 
    338     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    339     UHashtable *map;
    340 };
    341 
    342 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    343     : map(NULL)
    344 {
    345     if (U_FAILURE(status)) {
    346         return;
    347     }
    348 
    349     map = uhash_open(uhash_hashLong, uhash_compareLong,
    350                      uhash_compareCaselessUnicodeString,
    351                      &status);
    352 
    353     if (U_FAILURE(status)) {
    354         return;
    355     }
    356 
    357     uhash_setValueDeleter(map, deleteStringList);
    358 }
    359 
    360 CEToStringsMap::~CEToStringsMap()
    361 {
    362     uhash_close(map);
    363 }
    364 
    365 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    366 {
    367     StringList *strings = getStringList(ce);
    368 
    369     if (strings == NULL) {
    370         strings = new StringList(status);
    371 
    372         if (strings == NULL || U_FAILURE(status)) {
    373             status = U_MEMORY_ALLOCATION_ERROR;
    374             return;
    375         }
    376 
    377         putStringList(ce, strings, status);
    378     }
    379 
    380     strings->add(string, status);
    381 }
    382 
    383 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    384 {
    385     return (StringList *) uhash_iget(map, ce);
    386 }
    387 
    388 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    389 {
    390     uhash_iput(map, ce, (void *) stringList, &status);
    391 }
    392 
    393 class StringToCEsMap : public UMemory
    394 {
    395 public:
    396     StringToCEsMap(UErrorCode &status);
    397     ~StringToCEsMap();
    398 
    399     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
    400     const CEList *get(const UnicodeString *string);
    401     void free(const CEList *list);
    402 
    403 private:
    404 
    405 
    406     UHashtable *map;
    407 };
    408 
    409 StringToCEsMap::StringToCEsMap(UErrorCode &status)
    410     : map(NULL)
    411 {
    412     if (U_FAILURE(status)) {
    413         return;
    414     }
    415 
    416     map = uhash_open(uhash_hashUnicodeString,
    417                      uhash_compareUnicodeString,
    418                      uhash_compareLong,
    419                      &status);
    420 
    421     if (U_FAILURE(status)) {
    422         return;
    423     }
    424 
    425     uhash_setValueDeleter(map, deleteCEList);
    426     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
    427 }
    428 
    429 StringToCEsMap::~StringToCEsMap()
    430 {
    431     uhash_close(map);
    432 }
    433 
    434 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
    435 {
    436     uhash_put(map, (void *) string, (void *) ces, &status);
    437 }
    438 
    439 const CEList *StringToCEsMap::get(const UnicodeString *string)
    440 {
    441     return (const CEList *) uhash_get(map, string);
    442 }
    443 
    444 class CollDataCacheEntry : public UMemory
    445 {
    446 public:
    447     CollDataCacheEntry(CollData *theData);
    448     ~CollDataCacheEntry();
    449 
    450     CollData *data;
    451     int32_t   refCount;
    452 };
    453 
    454 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
    455     : data(theData), refCount(1)
    456 {
    457     // nothing else to do
    458 }
    459 
    460 CollDataCacheEntry::~CollDataCacheEntry()
    461 {
    462     // check refCount?
    463     delete data;
    464 }
    465 
    466 class CollDataCache : public UMemory
    467 {
    468 public:
    469     CollDataCache(UErrorCode &status);
    470     ~CollDataCache();
    471 
    472     CollData *get(UCollator *collator, UErrorCode &status);
    473     void unref(CollData *collData);
    474 
    475     void flush();
    476 
    477 private:
    478     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
    479     static void deleteKey(char *key);
    480 
    481     UHashtable *cache;
    482 };
    483 static UMutex lock = U_MUTEX_INITIALIZER;
    484 
    485 U_CDECL_BEGIN
    486 static void U_CALLCONV
    487 deleteCollDataCacheEntry(void *obj)
    488 {
    489     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
    490 
    491     delete entry;
    492 }
    493 U_CDECL_END
    494 
    495 CollDataCache::CollDataCache(UErrorCode &status)
    496     : cache(NULL)
    497 {
    498     if (U_FAILURE(status)) {
    499         return;
    500     }
    501 
    502     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
    503 
    504     if (U_FAILURE(status)) {
    505         return;
    506     }
    507 
    508     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
    509     uhash_setKeyDeleter(cache, deleteChars);
    510 }
    511 
    512 CollDataCache::~CollDataCache()
    513 {
    514     umtx_lock(&lock);
    515     uhash_close(cache);
    516     cache = NULL;
    517     umtx_unlock(&lock);
    518 }
    519 
    520 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
    521 {
    522     char keyBuffer[KEY_BUFFER_SIZE];
    523     int32_t keyLength = KEY_BUFFER_SIZE;
    524     char *key = getKey(collator, keyBuffer, &keyLength);
    525     CollData *result = NULL, *newData = NULL;
    526     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
    527 
    528     umtx_lock(&lock);
    529     entry = (CollDataCacheEntry *) uhash_get(cache, key);
    530 
    531     if (entry == NULL) {
    532         umtx_unlock(&lock);
    533 
    534         newData = new CollData(collator, key, keyLength, status);
    535         newEntry = new CollDataCacheEntry(newData);
    536 
    537         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
    538             status = U_MEMORY_ALLOCATION_ERROR;
    539             return NULL;
    540         }
    541 
    542         umtx_lock(&lock);
    543         entry = (CollDataCacheEntry *) uhash_get(cache, key);
    544 
    545         if (entry == NULL) {
    546             uhash_put(cache, newData->key, newEntry, &status);
    547             umtx_unlock(&lock);
    548 
    549             if (U_FAILURE(status)) {
    550                 delete newEntry;
    551                 delete newData;
    552 
    553                 return NULL;
    554             }
    555 
    556             return newData;
    557         }
    558     }
    559 
    560     result = entry->data;
    561     entry->refCount += 1;
    562     umtx_unlock(&lock);
    563 
    564     if (key != keyBuffer) {
    565         deleteKey(key);
    566     }
    567 
    568     if (newEntry != NULL) {
    569         delete newEntry;
    570         delete newData;
    571     }
    572 
    573     return result;
    574 }
    575 
    576 void CollDataCache::unref(CollData *collData)
    577 {
    578     CollDataCacheEntry *entry = NULL;
    579 
    580     umtx_lock(&lock);
    581     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
    582 
    583     if (entry != NULL) {
    584         entry->refCount -= 1;
    585     }
    586     umtx_unlock(&lock);
    587 }
    588 
    589 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
    590 {
    591     UErrorCode status = U_ZERO_ERROR;
    592     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    593 
    594     if (len >= *keyBufferLength) {
    595         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
    596         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
    597         status = U_ZERO_ERROR;
    598 
    599         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    600     }
    601 
    602     keyBuffer[len] = '\0';
    603 
    604     return keyBuffer;
    605 }
    606 
    607 void CollDataCache::flush()
    608 {
    609     const UHashElement *element;
    610     int32_t pos = -1;
    611 
    612     umtx_lock(&lock);
    613     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
    614         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
    615 
    616         if (entry->refCount <= 0) {
    617             uhash_removeElement(cache, element);
    618         }
    619     }
    620     umtx_unlock(&lock);
    621 }
    622 
    623 void CollDataCache::deleteKey(char *key)
    624 {
    625     DELETE_ARRAY(key);
    626 }
    627 
    628 U_CDECL_BEGIN
    629 static UBool coll_data_cleanup(void) {
    630     CollData::freeCollDataCache();
    631   return TRUE;
    632 }
    633 U_CDECL_END
    634 
    635 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
    636 
    637 CollData::CollData()
    638 {
    639     // nothing
    640 }
    641 
    642 #define CLONE_COLLATOR
    643 
    644 //#define CACHE_CELISTS
    645 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
    646     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
    647 {
    648     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    649     // i.e. other, control, private use, format, surrogate
    650     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    651     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    652     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    653 
    654     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    655     // i.e. all the characers we handle implicitly
    656     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    657     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    658     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    659 
    660     if (U_FAILURE(status)) {
    661         return;
    662     }
    663 
    664     USet *expansions   = uset_openEmpty();
    665     USet *contractions = uset_openEmpty();
    666     int32_t itemCount;
    667 
    668 #ifdef CACHE_CELISTS
    669     charsToCEList = new StringToCEsMap(status);
    670 
    671     if (U_FAILURE(status)) {
    672         goto bail;
    673     }
    674 #else
    675     charsToCEList = NULL;
    676 #endif
    677 
    678     ceToCharsStartingWith = new CEToStringsMap(status);
    679 
    680     if (U_FAILURE(status)) {
    681         goto bail;
    682     }
    683 
    684     if (cacheKeyLength > KEY_BUFFER_SIZE) {
    685         key = NEW_ARRAY(char, cacheKeyLength);
    686 
    687         if (key == NULL) {
    688             status = U_MEMORY_ALLOCATION_ERROR;
    689             goto bail;
    690         }
    691     } else {
    692         key = keyBuffer;
    693     }
    694 
    695     ARRAY_COPY(key, cacheKey, cacheKeyLength);
    696 
    697 #ifdef CLONE_COLLATOR
    698     coll = ucol_safeClone(collator, NULL, NULL, &status);
    699 
    700     if (U_FAILURE(status)) {
    701         goto bail;
    702     }
    703 #else
    704     coll = collator;
    705 #endif
    706 
    707     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    708 
    709     uset_addAll(charsToTest, contractions);
    710     uset_addAll(charsToTest, expansions);
    711     uset_removeAll(charsToTest, charsToRemove);
    712 
    713     itemCount = uset_getItemCount(charsToTest);
    714     for(int32_t item = 0; item < itemCount; item += 1) {
    715         UChar32 start = 0, end = 0;
    716         UChar buffer[16];
    717         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    718                                    buffer, 16, &status);
    719 
    720         if (len == 0) {
    721             for (UChar32 ch = start; ch <= end; ch += 1) {
    722                 UnicodeString *st = new UnicodeString(ch);
    723 
    724                 if (st == NULL) {
    725                     status = U_MEMORY_ALLOCATION_ERROR;
    726                     break;
    727                 }
    728 
    729                 CEList *ceList = new CEList(coll, *st, status);
    730 
    731                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    732 
    733 #ifdef CACHE_CELISTS
    734                 charsToCEList->put(st, ceList, status);
    735 #else
    736                 delete ceList;
    737                 delete st;
    738 #endif
    739             }
    740         } else if (len > 0) {
    741             UnicodeString *st = new UnicodeString(buffer, len);
    742 
    743             if (st == NULL) {
    744                 status = U_MEMORY_ALLOCATION_ERROR;
    745                 break;
    746             }
    747 
    748             CEList *ceList = new CEList(coll, *st, status);
    749 
    750             ceToCharsStartingWith->put(ceList->get(0), st, status);
    751 
    752 #ifdef CACHE_CELISTS
    753             charsToCEList->put(st, ceList, status);
    754 #else
    755             delete ceList;
    756             delete st;
    757 #endif
    758         } else {
    759             // shouldn't happen...
    760         }
    761 
    762         if (U_FAILURE(status)) {
    763              break;
    764         }
    765     }
    766 
    767 bail:
    768     uset_close(contractions);
    769     uset_close(expansions);
    770     uset_close(charsToRemove);
    771     uset_close(charsToTest);
    772 
    773     if (U_FAILURE(status)) {
    774         return;
    775     }
    776 
    777      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
    778                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
    779      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
    780      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
    781      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    782      CEList hanList(coll, hanString, status);
    783      CEList jamoList(coll, jamoString, status);
    784      int32_t j = 0;
    785 
    786      if (U_FAILURE(status)) {
    787          return;
    788      }
    789 
    790      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    791          uint32_t jce = jamoList[c];
    792 
    793          if (! isContinuation(jce)) {
    794              jamoLimits[j++] = jce;
    795          }
    796      }
    797 
    798      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    799 
    800      minHan = 0xFFFFFFFF;
    801      maxHan = 0;
    802 
    803      for(int32_t h = 0; h < hanList.size(); h += 2) {
    804          uint32_t han = (uint32_t) hanList[h];
    805 
    806          if (han < minHan) {
    807              minHan = han;
    808          }
    809 
    810          if (han > maxHan) {
    811              maxHan = han;
    812          }
    813      }
    814 
    815      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    816 }
    817 
    818 CollData::~CollData()
    819 {
    820 #ifdef CLONE_COLLATOR
    821    ucol_close(coll);
    822 #endif
    823 
    824    if (key != keyBuffer) {
    825        DELETE_ARRAY(key);
    826    }
    827 
    828    delete ceToCharsStartingWith;
    829 
    830 #ifdef CACHE_CELISTS
    831    delete charsToCEList;
    832 #endif
    833 }
    834 
    835 UCollator *CollData::getCollator() const
    836 {
    837     return coll;
    838 }
    839 
    840 const StringList *CollData::getStringList(int32_t ce) const
    841 {
    842     return ceToCharsStartingWith->getStringList(ce);
    843 }
    844 
    845 const CEList *CollData::getCEList(const UnicodeString *string) const
    846 {
    847 #ifdef CACHE_CELISTS
    848     return charsToCEList->get(string);
    849 #else
    850     UErrorCode status = U_ZERO_ERROR;
    851     const CEList *list = new CEList(coll, *string, status);
    852 
    853     if (U_FAILURE(status)) {
    854         delete list;
    855         list = NULL;
    856     }
    857 
    858     return list;
    859 #endif
    860 }
    861 
    862 void CollData::freeCEList(const CEList *list)
    863 {
    864 #ifndef CACHE_CELISTS
    865     delete list;
    866 #endif
    867 }
    868 
    869 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    870 {
    871     // find out shortest string for the longest sequence of ces.
    872     // this can probably be folded with the minLengthCache...
    873 
    874     if (history[offset] >= 0) {
    875         return history[offset];
    876     }
    877 
    878     uint32_t ce = ceList->get(offset);
    879     int32_t maxOffset = ceList->size();
    880     int32_t shortestLength = INT32_MAX;
    881     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    882 
    883     if (strings != NULL) {
    884         int32_t stringCount = strings->size();
    885 
    886         for (int32_t s = 0; s < stringCount; s += 1) {
    887             const UnicodeString *string = strings->get(s);
    888 #ifdef CACHE_CELISTS
    889             const CEList *ceList2 = charsToCEList->get(string);
    890 #else
    891             UErrorCode status = U_ZERO_ERROR;
    892             const CEList *ceList2 = new CEList(coll, *string, status);
    893 
    894             if (U_FAILURE(status)) {
    895                 delete ceList2;
    896                 ceList2 = NULL;
    897             }
    898 #endif
    899 
    900             if (ceList->matchesAt(offset, ceList2)) {
    901                 U_ASSERT(ceList2 != NULL);
    902                 int32_t clength = ceList2->size();
    903                 int32_t slength = string->length();
    904                 int32_t roffset = offset + clength;
    905                 int32_t rlength = 0;
    906 
    907                 if (roffset < maxOffset) {
    908                     rlength = minLengthInChars(ceList, roffset, history);
    909 
    910                     if (rlength <= 0) {
    911                     // delete before continue to avoid memory leak.
    912 #ifndef CACHE_CELISTS
    913                         delete ceList2;
    914 #endif
    915                         // ignore any dead ends
    916                         continue;
    917                     }
    918                 }
    919 
    920                 if (shortestLength > slength + rlength) {
    921                     shortestLength = slength + rlength;
    922                 }
    923             }
    924 
    925 #ifndef CACHE_CELISTS
    926             delete ceList2;
    927 #endif
    928         }
    929     }
    930 
    931     if (shortestLength == INT32_MAX) {
    932         // No matching strings at this offset. See if
    933         // the CE is in a range we can handle manually.
    934         if (ce >= minHan && ce < maxHan) {
    935             // all han have implicit orders which
    936             // generate two CEs.
    937             int32_t roffset = offset + 2;
    938             int32_t rlength = 0;
    939 
    940           //history[roffset++] = -1;
    941           //history[roffset++] = 1;
    942 
    943             if (roffset < maxOffset) {
    944                 rlength = minLengthInChars(ceList, roffset, history);
    945             }
    946 
    947             if (rlength < 0) {
    948                 return -1;
    949             }
    950 
    951             shortestLength = 1 + rlength;
    952             goto have_shortest;
    953         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    954             int32_t roffset = offset;
    955             int32_t rlength = 0;
    956 
    957             // **** this loop may not handle archaic Hangul correctly ****
    958             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    959                 uint32_t jce = ceList->get(roffset);
    960 
    961                 // Some Jamo have 24-bit primary order; skip the
    962                 // 2nd CE. This should always be OK because if
    963                 // we're still in the loop all we've seen are
    964                 // a series of Jamo in LVT order.
    965                 if (isContinuation(jce)) {
    966                     continue;
    967                 }
    968 
    969                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    970                     break;
    971                 }
    972             }
    973 
    974             if (roffset == offset) {
    975                 // we started with a non-L Jamo...
    976                 // just say it comes from a single character
    977                 roffset += 1;
    978 
    979                 // See if the single Jamo has a 24-bit order.
    980                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    981                     roffset += 1;
    982                 }
    983             }
    984 
    985             if (roffset < maxOffset) {
    986                 rlength = minLengthInChars(ceList, roffset, history);
    987             }
    988 
    989             if (rlength < 0) {
    990                 return -1;
    991             }
    992 
    993             shortestLength = 1 + rlength;
    994             goto have_shortest;
    995         }
    996 
    997         // Can't handle it manually either. Just move on.
    998         return -1;
    999     }
   1000 
   1001 have_shortest:
   1002     history[offset] = shortestLength;
   1003 
   1004     return shortestLength;
   1005 }
   1006 
   1007 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
   1008 {
   1009     int32_t clength = ceList->size();
   1010     int32_t *history = NEW_ARRAY(int32_t, clength);
   1011 
   1012     for (int32_t i = 0; i < clength; i += 1) {
   1013         history[i] = -1;
   1014     }
   1015 
   1016     int32_t minLength = minLengthInChars(ceList, offset, history);
   1017 
   1018     DELETE_ARRAY(history);
   1019 
   1020     return minLength;
   1021 }
   1022 
   1023 CollData *CollData::open(UCollator *collator, UErrorCode &status)
   1024 {
   1025     if (U_FAILURE(status)) {
   1026         return NULL;
   1027     }
   1028 
   1029     CollDataCache *cache = getCollDataCache();
   1030 
   1031     return cache->get(collator, status);
   1032 }
   1033 
   1034 void CollData::close(CollData *collData)
   1035 {
   1036     CollDataCache *cache = getCollDataCache();
   1037 
   1038     cache->unref(collData);
   1039 }
   1040 
   1041 CollDataCache *CollData::collDataCache = NULL;
   1042 
   1043 CollDataCache *CollData::getCollDataCache()
   1044 {
   1045     UErrorCode status = U_ZERO_ERROR;
   1046     CollDataCache *cache = NULL;
   1047 
   1048     UMTX_CHECK(NULL, collDataCache, cache);
   1049 
   1050     if (cache == NULL) {
   1051         cache = new CollDataCache(status);
   1052 
   1053         if (U_FAILURE(status)) {
   1054             delete cache;
   1055             return NULL;
   1056         }
   1057 
   1058         umtx_lock(NULL);
   1059         if (collDataCache == NULL) {
   1060             collDataCache = cache;
   1061 
   1062             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
   1063         }
   1064         umtx_unlock(NULL);
   1065 
   1066         if (collDataCache != cache) {
   1067             delete cache;
   1068         }
   1069     }
   1070 
   1071     return collDataCache;
   1072 }
   1073 
   1074 void CollData::freeCollDataCache()
   1075 {
   1076     CollDataCache *cache = NULL;
   1077 
   1078     UMTX_CHECK(NULL, collDataCache, cache);
   1079 
   1080     if (cache != NULL) {
   1081         umtx_lock(NULL);
   1082         if (collDataCache != NULL) {
   1083             collDataCache = NULL;
   1084         } else {
   1085             cache = NULL;
   1086         }
   1087         umtx_unlock(NULL);
   1088 
   1089         delete cache;
   1090     }
   1091 }
   1092 
   1093 void CollData::flushCollDataCache()
   1094 {
   1095     CollDataCache *cache = NULL;
   1096 
   1097     UMTX_CHECK(NULL, collDataCache, cache);
   1098 
   1099     // **** this will fail if the another ****
   1100     // **** thread deletes the cache here ****
   1101     if (cache != NULL) {
   1102         cache->flush();
   1103     }
   1104 }
   1105 
   1106 U_NAMESPACE_END
   1107 
   1108 #endif // #if !UCONFIG_NO_COLLATION
   1109