Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2009, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucln_in.h"
     30 #include "ucol_imp.h"
     31 #include "umutex.h"
     32 
     33 #include "unicode/colldata.h"
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     41 
     42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
     43 
     44 #ifdef INSTRUMENT_CELIST
     45 int32_t CEList::_active = 0;
     46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     47 #endif
     48 
     49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     50     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     51 {
     52     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     53     UCollationStrength strength = ucol_getStrength(coll);
     54     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     55     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     56     uint32_t strengthMask = 0;
     57     int32_t order;
     58 
     59     if (U_FAILURE(status)) {
     60         return;
     61     }
     62 
     63     // **** only set flag if string has Han(gul) ****
     64     ucol_forceHanImplicit(elems, &status);
     65 
     66     switch (strength)
     67     {
     68     default:
     69         strengthMask |= UCOL_TERTIARYORDERMASK;
     70         /* fall through */
     71 
     72     case UCOL_SECONDARY:
     73         strengthMask |= UCOL_SECONDARYORDERMASK;
     74         /* fall through */
     75 
     76     case UCOL_PRIMARY:
     77         strengthMask |= UCOL_PRIMARYORDERMASK;
     78     }
     79 
     80 #ifdef INSTRUMENT_CELIST
     81     _active += 1;
     82     _histogram[0] += 1;
     83 #endif
     84 
     85     ces = ceBuffer;
     86 
     87     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     88         UBool cont = isContinuation(order);
     89 
     90         order &= strengthMask;
     91 
     92         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     93             if (strength >= UCOL_QUATERNARY) {
     94                 order &= UCOL_PRIMARYORDERMASK;
     95             } else {
     96                 order = UCOL_IGNORABLE;
     97             }
     98         }
     99 
    100         if (order == UCOL_IGNORABLE) {
    101             continue;
    102         }
    103 
    104         if (cont) {
    105             order |= UCOL_CONTINUATION_MARKER;
    106         }
    107 
    108         add(order, status);
    109     }
    110 
    111     ucol_closeElements(elems);
    112 }
    113 
    114 CEList::~CEList()
    115 {
    116 #ifdef INSTRUMENT_CELIST
    117     _active -= 1;
    118 #endif
    119 
    120     if (ces != ceBuffer) {
    121         DELETE_ARRAY(ces);
    122     }
    123 }
    124 
    125 void CEList::add(uint32_t ce, UErrorCode &status)
    126 {
    127     if (U_FAILURE(status)) {
    128         return;
    129     }
    130 
    131     if (listSize >= listMax) {
    132         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    133 
    134 #ifdef INSTRUMENT_CELIST
    135         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
    136 #endif
    137 
    138         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    139 
    140         if (newCEs == NULL) {
    141             status = U_MEMORY_ALLOCATION_ERROR;
    142             return;
    143         }
    144 
    145         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    146 
    147         if (ces != ceBuffer) {
    148             DELETE_ARRAY(ces);
    149         }
    150 
    151         ces = newCEs;
    152         listMax = newMax;
    153     }
    154 
    155     ces[listSize++] = ce;
    156 }
    157 
    158 uint32_t CEList::get(int32_t index) const
    159 {
    160     if (index >= 0 && index < listSize) {
    161         return ces[index];
    162     }
    163 
    164     return UCOL_NULLORDER;
    165 }
    166 
    167 uint32_t &CEList::operator[](int32_t index) const
    168 {
    169     return ces[index];
    170 }
    171 
    172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    173 {
    174     if (other == NULL || listSize - offset < other->size()) {
    175         return FALSE;
    176     }
    177 
    178     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    179         if (ces[i] != (*other)[j]) {
    180             return FALSE;
    181         }
    182     }
    183 
    184     return TRUE;
    185 }
    186 
    187 int32_t CEList::size() const
    188 {
    189     return listSize;
    190 }
    191 
    192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
    193 
    194 #ifdef INSTRUMENT_STRING_LIST
    195 int32_t StringList::_lists = 0;
    196 int32_t StringList::_strings = 0;
    197 int32_t StringList::_histogram[101] = {0};
    198 #endif
    199 
    200 StringList::StringList(UErrorCode &status)
    201     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    202 {
    203     if (U_FAILURE(status)) {
    204         return;
    205     }
    206 
    207     strings = new UnicodeString [listMax];
    208 
    209     if (strings == NULL) {
    210         status = U_MEMORY_ALLOCATION_ERROR;
    211         return;
    212     }
    213 
    214 #ifdef INSTRUMENT_STRING_LIST
    215     _lists += 1;
    216     _histogram[0] += 1;
    217 #endif
    218 }
    219 
    220 StringList::~StringList()
    221 {
    222     delete[] strings;
    223 }
    224 
    225 void StringList::add(const UnicodeString *string, UErrorCode &status)
    226 {
    227     if (U_FAILURE(status)) {
    228         return;
    229     }
    230 
    231 #ifdef INSTRUMENT_STRING_LIST
    232     _strings += 1;
    233 #endif
    234 
    235     if (listSize >= listMax) {
    236         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    237 
    238         UnicodeString *newStrings = new UnicodeString[newMax];
    239 
    240         uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString));
    241 
    242 #ifdef INSTRUMENT_STRING_LIST
    243         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
    244 
    245         if (_h > 100) {
    246             _h = 100;
    247         }
    248 
    249         _histogram[_h] += 1;
    250 #endif
    251 
    252         delete[] strings;
    253         strings = newStrings;
    254         listMax = newMax;
    255     }
    256 
    257     // The ctor initialized all the strings in
    258     // the array to empty strings, so this
    259     // is the same as copying the source string.
    260     strings[listSize++].append(*string);
    261 }
    262 
    263 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    264 {
    265     const UnicodeString string(chars, count);
    266 
    267     add(&string, status);
    268 }
    269 
    270 const UnicodeString *StringList::get(int32_t index) const
    271 {
    272     if (index >= 0 && index < listSize) {
    273         return &strings[index];
    274     }
    275 
    276     return NULL;
    277 }
    278 
    279 int32_t StringList::size() const
    280 {
    281     return listSize;
    282 }
    283 
    284 
    285 U_CFUNC void deleteStringList(void *obj);
    286 
    287 class CEToStringsMap : public UMemory
    288 {
    289 public:
    290 
    291     CEToStringsMap(UErrorCode &status);
    292     ~CEToStringsMap();
    293 
    294     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    295     StringList *getStringList(uint32_t ce) const;
    296 
    297 private:
    298 
    299     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    300     UHashtable *map;
    301 };
    302 
    303 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    304     : map(NULL)
    305 {
    306     if (U_FAILURE(status)) {
    307         return;
    308     }
    309 
    310     map = uhash_open(uhash_hashLong, uhash_compareLong,
    311                      uhash_compareCaselessUnicodeString,
    312                      &status);
    313 
    314     if (U_FAILURE(status)) {
    315         return;
    316     }
    317 
    318     uhash_setValueDeleter(map, deleteStringList);
    319 }
    320 
    321 CEToStringsMap::~CEToStringsMap()
    322 {
    323     uhash_close(map);
    324 }
    325 
    326 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    327 {
    328     StringList *strings = getStringList(ce);
    329 
    330     if (strings == NULL) {
    331         strings = new StringList(status);
    332 
    333         if (strings == NULL || U_FAILURE(status)) {
    334             status = U_MEMORY_ALLOCATION_ERROR;
    335             return;
    336         }
    337 
    338         putStringList(ce, strings, status);
    339     }
    340 
    341     strings->add(string, status);
    342 }
    343 
    344 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    345 {
    346     return (StringList *) uhash_iget(map, ce);
    347 }
    348 
    349 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    350 {
    351     uhash_iput(map, ce, (void *) stringList, &status);
    352 }
    353 
    354 U_CFUNC void deleteStringList(void *obj)
    355 {
    356     StringList *strings = (StringList *) obj;
    357 
    358     delete strings;
    359 }
    360 
    361 U_CFUNC void deleteCEList(void *obj);
    362 U_CFUNC void deleteUnicodeStringKey(void *obj);
    363 
    364 class StringToCEsMap : public UMemory
    365 {
    366 public:
    367     StringToCEsMap(UErrorCode &status);
    368     ~StringToCEsMap();
    369 
    370     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
    371     const CEList *get(const UnicodeString *string);
    372     void free(const CEList *list);
    373 
    374 private:
    375 
    376 
    377     UHashtable *map;
    378 };
    379 
    380 StringToCEsMap::StringToCEsMap(UErrorCode &status)
    381     : map(NULL)
    382 {
    383     if (U_FAILURE(status)) {
    384         return;
    385     }
    386 
    387     map = uhash_open(uhash_hashUnicodeString,
    388                      uhash_compareUnicodeString,
    389                      uhash_compareLong,
    390                      &status);
    391 
    392     if (U_FAILURE(status)) {
    393         return;
    394     }
    395 
    396     uhash_setValueDeleter(map, deleteCEList);
    397     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
    398 }
    399 
    400 StringToCEsMap::~StringToCEsMap()
    401 {
    402     uhash_close(map);
    403 }
    404 
    405 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
    406 {
    407     uhash_put(map, (void *) string, (void *) ces, &status);
    408 }
    409 
    410 const CEList *StringToCEsMap::get(const UnicodeString *string)
    411 {
    412     return (const CEList *) uhash_get(map, string);
    413 }
    414 
    415 U_CFUNC void deleteCEList(void *obj)
    416 {
    417     CEList *list = (CEList *) obj;
    418 
    419     delete list;
    420 }
    421 
    422 U_CFUNC void deleteUnicodeStringKey(void *obj)
    423 {
    424     UnicodeString *key = (UnicodeString *) obj;
    425 
    426     delete key;
    427 }
    428 
    429 class CollDataCacheEntry : public UMemory
    430 {
    431 public:
    432     CollDataCacheEntry(CollData *theData);
    433     ~CollDataCacheEntry();
    434 
    435     CollData *data;
    436     int32_t   refCount;
    437 };
    438 
    439 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
    440     : data(theData), refCount(1)
    441 {
    442     // nothing else to do
    443 }
    444 
    445 CollDataCacheEntry::~CollDataCacheEntry()
    446 {
    447     // check refCount?
    448     delete data;
    449 }
    450 
    451 class CollDataCache : public UMemory
    452 {
    453 public:
    454     CollDataCache(UErrorCode &status);
    455     ~CollDataCache();
    456 
    457     CollData *get(UCollator *collator, UErrorCode &status);
    458     void unref(CollData *collData);
    459 
    460     void flush();
    461 
    462 private:
    463     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
    464     static void deleteKey(char *key);
    465 
    466     UMTX lock;
    467     UHashtable *cache;
    468 };
    469 
    470 U_CFUNC void deleteChars(void * /*obj*/)
    471 {
    472     // char *chars = (char *) obj;
    473     // All the key strings are owned by the
    474     // CollData objects and don't need to
    475     // be freed here.
    476   //DELETE_ARRAY(chars);
    477 }
    478 
    479 U_CFUNC void deleteCollDataCacheEntry(void *obj)
    480 {
    481     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
    482 
    483     delete entry;
    484 }
    485 
    486 CollDataCache::CollDataCache(UErrorCode &status)
    487     : lock(0), cache(NULL)
    488 {
    489     if (U_FAILURE(status)) {
    490         return;
    491     }
    492 
    493     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
    494 
    495     if (U_FAILURE(status)) {
    496         return;
    497     }
    498 
    499     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
    500     uhash_setKeyDeleter(cache, deleteChars);
    501 }
    502 
    503 CollDataCache::~CollDataCache()
    504 {
    505     umtx_lock(&lock);
    506     uhash_close(cache);
    507     cache = NULL;
    508     umtx_unlock(&lock);
    509 
    510     umtx_destroy(&lock);
    511 }
    512 
    513 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
    514 {
    515     char keyBuffer[KEY_BUFFER_SIZE];
    516     int32_t keyLength = KEY_BUFFER_SIZE;
    517     char *key = getKey(collator, keyBuffer, &keyLength);
    518     CollData *result = NULL, *newData = NULL;
    519     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
    520 
    521     umtx_lock(&lock);
    522     entry = (CollDataCacheEntry *) uhash_get(cache, key);
    523 
    524     if (entry == NULL) {
    525         umtx_unlock(&lock);
    526 
    527         newData = new CollData(collator, key, keyLength, status);
    528         newEntry = new CollDataCacheEntry(newData);
    529 
    530         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
    531             status = U_MEMORY_ALLOCATION_ERROR;
    532             return NULL;
    533         }
    534 
    535         umtx_lock(&lock);
    536         entry = (CollDataCacheEntry *) uhash_get(cache, key);
    537 
    538         if (entry == NULL) {
    539             uhash_put(cache, newData->key, newEntry, &status);
    540             umtx_unlock(&lock);
    541 
    542             if (U_FAILURE(status)) {
    543                 delete newEntry;
    544                 delete newData;
    545 
    546                 return NULL;
    547             }
    548 
    549             return newData;
    550         }
    551     }
    552 
    553     result = entry->data;
    554     entry->refCount += 1;
    555     umtx_unlock(&lock);
    556 
    557     if (key != keyBuffer) {
    558         deleteKey(key);
    559     }
    560 
    561     if (newEntry != NULL) {
    562         delete newEntry;
    563         delete newData;
    564     }
    565 
    566     return result;
    567 }
    568 
    569 void CollDataCache::unref(CollData *collData)
    570 {
    571     CollDataCacheEntry *entry = NULL;
    572 
    573     umtx_lock(&lock);
    574     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
    575 
    576     if (entry != NULL) {
    577         entry->refCount -= 1;
    578     }
    579     umtx_unlock(&lock);
    580 }
    581 
    582 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
    583 {
    584     UErrorCode status = U_ZERO_ERROR;
    585     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    586 
    587     if (len >= *keyBufferLength) {
    588         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
    589         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
    590         status = U_ZERO_ERROR;
    591 
    592         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    593     }
    594 
    595     keyBuffer[len] = '\0';
    596 
    597     return keyBuffer;
    598 }
    599 
    600 void CollDataCache::flush()
    601 {
    602     const UHashElement *element;
    603     int32_t pos = -1;
    604 
    605     umtx_lock(&lock);
    606     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
    607         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
    608 
    609         if (entry->refCount <= 0) {
    610             uhash_removeElement(cache, element);
    611         }
    612     }
    613     umtx_unlock(&lock);
    614 }
    615 
    616 void CollDataCache::deleteKey(char *key)
    617 {
    618     DELETE_ARRAY(key);
    619 }
    620 
    621 U_CDECL_BEGIN
    622 static UBool coll_data_cleanup(void) {
    623     CollData::freeCollDataCache();
    624   return TRUE;
    625 }
    626 U_CDECL_END
    627 
    628 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
    629 
    630 CollData::CollData()
    631 {
    632     // nothing
    633 }
    634 
    635 #define CLONE_COLLATOR
    636 
    637 //#define CACHE_CELISTS
    638 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
    639     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
    640 {
    641     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    642     // i.e. other, control, private use, format, surrogate
    643     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    644     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    645     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    646 
    647     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    648     // i.e. all the characers we handle implicitly
    649     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    650     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    651     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    652 
    653     if (U_FAILURE(status)) {
    654         return;
    655     }
    656 
    657     USet *expansions   = uset_openEmpty();
    658     USet *contractions = uset_openEmpty();
    659     int32_t itemCount;
    660 
    661 #ifdef CACHE_CELISTS
    662     charsToCEList = new StringToCEsMap(status);
    663 
    664     if (U_FAILURE(status)) {
    665         goto bail;
    666     }
    667 #else
    668     charsToCEList = NULL;
    669 #endif
    670 
    671     ceToCharsStartingWith = new CEToStringsMap(status);
    672 
    673     if (U_FAILURE(status)) {
    674         goto bail;
    675     }
    676 
    677     if (cacheKeyLength > KEY_BUFFER_SIZE) {
    678         key = NEW_ARRAY(char, cacheKeyLength);
    679 
    680         if (key == NULL) {
    681             status = U_MEMORY_ALLOCATION_ERROR;
    682             goto bail;
    683         }
    684     } else {
    685         key = keyBuffer;
    686     }
    687 
    688     ARRAY_COPY(key, cacheKey, cacheKeyLength);
    689 
    690 #ifdef CLONE_COLLATOR
    691     coll = ucol_safeClone(collator, NULL, NULL, &status);
    692 
    693     if (U_FAILURE(status)) {
    694         goto bail;
    695     }
    696 #else
    697     coll = collator;
    698 #endif
    699 
    700     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    701 
    702     uset_addAll(charsToTest, contractions);
    703     uset_addAll(charsToTest, expansions);
    704     uset_removeAll(charsToTest, charsToRemove);
    705 
    706     itemCount = uset_getItemCount(charsToTest);
    707     for(int32_t item = 0; item < itemCount; item += 1) {
    708         UChar32 start = 0, end = 0;
    709         UChar buffer[16];
    710         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    711                                    buffer, 16, &status);
    712 
    713         if (len == 0) {
    714             for (UChar32 ch = start; ch <= end; ch += 1) {
    715                 UnicodeString *st = new UnicodeString(ch);
    716 
    717                 if (st == NULL) {
    718                     status = U_MEMORY_ALLOCATION_ERROR;
    719                     break;
    720                 }
    721 
    722                 CEList *ceList = new CEList(coll, *st, status);
    723 
    724                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    725 
    726 #ifdef CACHE_CELISTS
    727                 charsToCEList->put(st, ceList, status);
    728 #else
    729                 delete ceList;
    730                 delete st;
    731 #endif
    732             }
    733         } else if (len > 0) {
    734             UnicodeString *st = new UnicodeString(buffer, len);
    735 
    736             if (st == NULL) {
    737                 status = U_MEMORY_ALLOCATION_ERROR;
    738                 break;
    739             }
    740 
    741             CEList *ceList = new CEList(coll, *st, status);
    742 
    743             ceToCharsStartingWith->put(ceList->get(0), st, status);
    744 
    745 #ifdef CACHE_CELISTS
    746             charsToCEList->put(st, ceList, status);
    747 #else
    748             delete ceList;
    749             delete st;
    750 #endif
    751         } else {
    752             // shouldn't happen...
    753         }
    754 
    755         if (U_FAILURE(status)) {
    756              break;
    757         }
    758     }
    759 
    760 bail:
    761     uset_close(contractions);
    762     uset_close(expansions);
    763     uset_close(charsToRemove);
    764     uset_close(charsToTest);
    765 
    766     if (U_FAILURE(status)) {
    767         return;
    768     }
    769 
    770      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
    771                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
    772      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
    773      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
    774      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    775      CEList hanList(coll, hanString, status);
    776      CEList jamoList(coll, jamoString, status);
    777      int32_t j = 0;
    778 
    779      if (U_FAILURE(status)) {
    780          return;
    781      }
    782 
    783      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    784          uint32_t jce = jamoList[c];
    785 
    786          if (! isContinuation(jce)) {
    787              jamoLimits[j++] = jce;
    788          }
    789      }
    790 
    791      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    792 
    793      minHan = 0xFFFFFFFF;
    794      maxHan = 0;
    795 
    796      for(int32_t h = 0; h < hanList.size(); h += 2) {
    797          uint32_t han = (uint32_t) hanList[h];
    798 
    799          if (han < minHan) {
    800              minHan = han;
    801          }
    802 
    803          if (han > maxHan) {
    804              maxHan = han;
    805          }
    806      }
    807 
    808      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    809 }
    810 
    811 CollData::~CollData()
    812 {
    813 #ifdef CLONE_COLLATOR
    814    ucol_close(coll);
    815 #endif
    816 
    817    if (key != keyBuffer) {
    818        DELETE_ARRAY(key);
    819    }
    820 
    821    delete ceToCharsStartingWith;
    822 
    823 #ifdef CACHE_CELISTS
    824    delete charsToCEList;
    825 #endif
    826 }
    827 
    828 UCollator *CollData::getCollator() const
    829 {
    830     return coll;
    831 }
    832 
    833 const StringList *CollData::getStringList(int32_t ce) const
    834 {
    835     return ceToCharsStartingWith->getStringList(ce);
    836 }
    837 
    838 const CEList *CollData::getCEList(const UnicodeString *string) const
    839 {
    840 #ifdef CACHE_CELISTS
    841     return charsToCEList->get(string);
    842 #else
    843     UErrorCode status = U_ZERO_ERROR;
    844     const CEList *list = new CEList(coll, *string, status);
    845 
    846     if (U_FAILURE(status)) {
    847         delete list;
    848         list = NULL;
    849     }
    850 
    851     return list;
    852 #endif
    853 }
    854 
    855 void CollData::freeCEList(const CEList *list)
    856 {
    857 #ifndef CACHE_CELISTS
    858     delete list;
    859 #endif
    860 }
    861 
    862 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    863 {
    864     // find out shortest string for the longest sequence of ces.
    865     // this can probably be folded with the minLengthCache...
    866 
    867     if (history[offset] >= 0) {
    868         return history[offset];
    869     }
    870 
    871     uint32_t ce = ceList->get(offset);
    872     int32_t maxOffset = ceList->size();
    873     int32_t shortestLength = INT32_MAX;
    874     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    875 
    876     if (strings != NULL) {
    877         int32_t stringCount = strings->size();
    878 
    879         for (int32_t s = 0; s < stringCount; s += 1) {
    880             const UnicodeString *string = strings->get(s);
    881 #ifdef CACHE_CELISTS
    882             const CEList *ceList2 = charsToCEList->get(string);
    883 #else
    884             UErrorCode status = U_ZERO_ERROR;
    885             const CEList *ceList2 = new CEList(coll, *string, status);
    886 
    887             if (U_FAILURE(status)) {
    888                 delete ceList2;
    889                 ceList2 = NULL;
    890             }
    891 #endif
    892 
    893             if (ceList->matchesAt(offset, ceList2)) {
    894                 int32_t clength = ceList2->size();
    895                 int32_t slength = string->length();
    896                 int32_t roffset = offset + clength;
    897                 int32_t rlength = 0;
    898 
    899                 if (roffset < maxOffset) {
    900                     rlength = minLengthInChars(ceList, roffset, history);
    901 
    902                     if (rlength <= 0) {
    903                     // delete before continue to avoid memory leak.
    904 #ifndef CACHE_CELISTS
    905                         delete ceList2;
    906 #endif
    907                         // ignore any dead ends
    908                         continue;
    909                     }
    910                 }
    911 
    912                 if (shortestLength > slength + rlength) {
    913                     shortestLength = slength + rlength;
    914                 }
    915             }
    916 
    917 #ifndef CACHE_CELISTS
    918             delete ceList2;
    919 #endif
    920         }
    921     }
    922 
    923     if (shortestLength == INT32_MAX) {
    924         // No matching strings at this offset. See if
    925         // the CE is in a range we can handle manually.
    926         if (ce >= minHan && ce < maxHan) {
    927             // all han have implicit orders which
    928             // generate two CEs.
    929             int32_t roffset = offset + 2;
    930             int32_t rlength = 0;
    931 
    932           //history[roffset++] = -1;
    933           //history[roffset++] = 1;
    934 
    935             if (roffset < maxOffset) {
    936                 rlength = minLengthInChars(ceList, roffset, history);
    937             }
    938 
    939             if (rlength < 0) {
    940                 return -1;
    941             }
    942 
    943             shortestLength = 1 + rlength;
    944             goto have_shortest;
    945         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    946             int32_t roffset = offset;
    947             int32_t rlength = 0;
    948 
    949             // **** this loop may not handle archaic Hangul correctly ****
    950             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    951                 uint32_t jce = ceList->get(roffset);
    952 
    953                 // Some Jamo have 24-bit primary order; skip the
    954                 // 2nd CE. This should always be OK because if
    955                 // we're still in the loop all we've seen are
    956                 // a series of Jamo in LVT order.
    957                 if (isContinuation(jce)) {
    958                     continue;
    959                 }
    960 
    961                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    962                     break;
    963                 }
    964             }
    965 
    966             if (roffset == offset) {
    967                 // we started with a non-L Jamo...
    968                 // just say it comes from a single character
    969                 roffset += 1;
    970 
    971                 // See if the single Jamo has a 24-bit order.
    972                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    973                     roffset += 1;
    974                 }
    975             }
    976 
    977             if (roffset < maxOffset) {
    978                 rlength = minLengthInChars(ceList, roffset, history);
    979             }
    980 
    981             if (rlength < 0) {
    982                 return -1;
    983             }
    984 
    985             shortestLength = 1 + rlength;
    986             goto have_shortest;
    987         }
    988 
    989         // Can't handle it manually either. Just move on.
    990         return -1;
    991     }
    992 
    993 have_shortest:
    994     history[offset] = shortestLength;
    995 
    996     return shortestLength;
    997 }
    998 
    999 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
   1000 {
   1001     int32_t clength = ceList->size();
   1002     int32_t *history = NEW_ARRAY(int32_t, clength);
   1003 
   1004     for (int32_t i = 0; i < clength; i += 1) {
   1005         history[i] = -1;
   1006     }
   1007 
   1008     int32_t minLength = minLengthInChars(ceList, offset, history);
   1009 
   1010     DELETE_ARRAY(history);
   1011 
   1012     return minLength;
   1013 }
   1014 
   1015 CollData *CollData::open(UCollator *collator, UErrorCode &status)
   1016 {
   1017     if (U_FAILURE(status)) {
   1018         return NULL;
   1019     }
   1020 
   1021     CollDataCache *cache = getCollDataCache();
   1022 
   1023     return cache->get(collator, status);
   1024 }
   1025 
   1026 void CollData::close(CollData *collData)
   1027 {
   1028     CollDataCache *cache = getCollDataCache();
   1029 
   1030     cache->unref(collData);
   1031 }
   1032 
   1033 CollDataCache *CollData::collDataCache = NULL;
   1034 
   1035 CollDataCache *CollData::getCollDataCache()
   1036 {
   1037     UErrorCode status = U_ZERO_ERROR;
   1038     CollDataCache *cache = NULL;
   1039 
   1040     UMTX_CHECK(NULL, collDataCache, cache);
   1041 
   1042     if (cache == NULL) {
   1043         cache = new CollDataCache(status);
   1044 
   1045         if (U_FAILURE(status)) {
   1046             delete cache;
   1047             return NULL;
   1048         }
   1049 
   1050         umtx_lock(NULL);
   1051         if (collDataCache == NULL) {
   1052             collDataCache = cache;
   1053 
   1054             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
   1055         }
   1056         umtx_unlock(NULL);
   1057 
   1058         if (collDataCache != cache) {
   1059             delete cache;
   1060         }
   1061     }
   1062 
   1063     return collDataCache;
   1064 }
   1065 
   1066 void CollData::freeCollDataCache()
   1067 {
   1068     CollDataCache *cache = NULL;
   1069 
   1070     UMTX_CHECK(NULL, collDataCache, cache);
   1071 
   1072     if (cache != NULL) {
   1073         umtx_lock(NULL);
   1074         if (collDataCache != NULL) {
   1075             collDataCache = NULL;
   1076         } else {
   1077             cache = NULL;
   1078         }
   1079         umtx_unlock(NULL);
   1080 
   1081         delete cache;
   1082     }
   1083 }
   1084 
   1085 void CollData::flushCollDataCache()
   1086 {
   1087     CollDataCache *cache = NULL;
   1088 
   1089     UMTX_CHECK(NULL, collDataCache, cache);
   1090 
   1091     // **** this will fail if the another ****
   1092     // **** thread deletes the cache here ****
   1093     if (cache != NULL) {
   1094         cache->flush();
   1095     }
   1096 }
   1097 
   1098 U_NAMESPACE_END
   1099 
   1100 #endif // #if !UCONFIG_NO_COLLATION
   1101