Home | History | Annotate | Download | only in i18n
      1 /*
      2  ******************************************************************************
      3  *   Copyright (C) 1996-2011, International Business Machines                 *
      4  *   Corporation and others.  All Rights Reserved.                            *
      5  ******************************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION
     11 
     12 #include "unicode/unistr.h"
     13 #include "unicode/putil.h"
     14 #include "unicode/usearch.h"
     15 
     16 #include "cmemory.h"
     17 #include "unicode/coll.h"
     18 #include "unicode/tblcoll.h"
     19 #include "unicode/coleitr.h"
     20 #include "unicode/ucoleitr.h"
     21 
     22 #include "unicode/regex.h"        // TODO: make conditional on regexp being built.
     23 
     24 #include "unicode/uniset.h"
     25 #include "unicode/uset.h"
     26 #include "unicode/ustring.h"
     27 #include "hash.h"
     28 #include "uhash.h"
     29 #include "ucln_in.h"
     30 #include "ucol_imp.h"
     31 #include "umutex.h"
     32 
     33 #include "unicode/colldata.h"
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
     38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
     39 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0])
     41 
     42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList)
     43 
     44 #ifdef INSTRUMENT_CELIST
     45 int32_t CEList::_active = 0;
     46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
     47 #endif
     48 
     49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
     50     : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
     51 {
     52     UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
     53     UCollationStrength strength = ucol_getStrength(coll);
     54     UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
     55     uint32_t variableTop = ucol_getVariableTop(coll, &status);
     56     uint32_t strengthMask = 0;
     57     int32_t order;
     58 
     59     if (U_FAILURE(status)) {
     60         return;
     61     }
     62 
     63     // **** only set flag if string has Han(gul) ****
     64     ucol_forceHanImplicit(elems, &status);
     65 
     66     switch (strength)
     67     {
     68     default:
     69         strengthMask |= UCOL_TERTIARYORDERMASK;
     70         /* fall through */
     71 
     72     case UCOL_SECONDARY:
     73         strengthMask |= UCOL_SECONDARYORDERMASK;
     74         /* fall through */
     75 
     76     case UCOL_PRIMARY:
     77         strengthMask |= UCOL_PRIMARYORDERMASK;
     78     }
     79 
     80 #ifdef INSTRUMENT_CELIST
     81     _active += 1;
     82     _histogram[0] += 1;
     83 #endif
     84 
     85     ces = ceBuffer;
     86 
     87     while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
     88         UBool cont = isContinuation(order);
     89 
     90         order &= strengthMask;
     91 
     92         if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
     93             if (strength >= UCOL_QUATERNARY) {
     94                 order &= UCOL_PRIMARYORDERMASK;
     95             } else {
     96                 order = UCOL_IGNORABLE;
     97             }
     98         }
     99 
    100         if (order == UCOL_IGNORABLE) {
    101             continue;
    102         }
    103 
    104         if (cont) {
    105             order |= UCOL_CONTINUATION_MARKER;
    106         }
    107 
    108         add(order, status);
    109     }
    110 
    111     ucol_closeElements(elems);
    112 }
    113 
    114 CEList::~CEList()
    115 {
    116 #ifdef INSTRUMENT_CELIST
    117     _active -= 1;
    118 #endif
    119 
    120     if (ces != ceBuffer) {
    121         DELETE_ARRAY(ces);
    122     }
    123 }
    124 
    125 void CEList::add(uint32_t ce, UErrorCode &status)
    126 {
    127     if (U_FAILURE(status)) {
    128         return;
    129     }
    130 
    131     if (listSize >= listMax) {
    132         int32_t newMax = listMax + CELIST_BUFFER_SIZE;
    133 
    134 #ifdef INSTRUMENT_CELIST
    135         _histogram[listSize / CELIST_BUFFER_SIZE] += 1;
    136 #endif
    137 
    138         uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax);
    139 
    140         if (newCEs == NULL) {
    141             status = U_MEMORY_ALLOCATION_ERROR;
    142             return;
    143         }
    144 
    145         uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t));
    146 
    147         if (ces != ceBuffer) {
    148             DELETE_ARRAY(ces);
    149         }
    150 
    151         ces = newCEs;
    152         listMax = newMax;
    153     }
    154 
    155     ces[listSize++] = ce;
    156 }
    157 
    158 uint32_t CEList::get(int32_t index) const
    159 {
    160     if (index >= 0 && index < listSize) {
    161         return ces[index];
    162     }
    163 
    164     return UCOL_NULLORDER;
    165 }
    166 
    167 uint32_t &CEList::operator[](int32_t index) const
    168 {
    169     return ces[index];
    170 }
    171 
    172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const
    173 {
    174     if (other == NULL || listSize - offset < other->size()) {
    175         return FALSE;
    176     }
    177 
    178     for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) {
    179         if (ces[i] != (*other)[j]) {
    180             return FALSE;
    181         }
    182     }
    183 
    184     return TRUE;
    185 }
    186 
    187 int32_t CEList::size() const
    188 {
    189     return listSize;
    190 }
    191 
    192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList)
    193 
    194 #ifdef INSTRUMENT_STRING_LIST
    195 int32_t StringList::_lists = 0;
    196 int32_t StringList::_strings = 0;
    197 int32_t StringList::_histogram[101] = {0};
    198 #endif
    199 
    200 StringList::StringList(UErrorCode &status)
    201     : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0)
    202 {
    203     if (U_FAILURE(status)) {
    204         return;
    205     }
    206 
    207     strings = new UnicodeString [listMax];
    208 
    209     if (strings == NULL) {
    210         status = U_MEMORY_ALLOCATION_ERROR;
    211         return;
    212     }
    213 
    214 #ifdef INSTRUMENT_STRING_LIST
    215     _lists += 1;
    216     _histogram[0] += 1;
    217 #endif
    218 }
    219 
    220 StringList::~StringList()
    221 {
    222     delete[] strings;
    223 }
    224 
    225 void StringList::add(const UnicodeString *string, UErrorCode &status)
    226 {
    227     if (U_FAILURE(status)) {
    228         return;
    229     }
    230 
    231 #ifdef INSTRUMENT_STRING_LIST
    232     _strings += 1;
    233 #endif
    234 
    235     if (listSize >= listMax) {
    236         int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE;
    237         UnicodeString *newStrings = new UnicodeString[newMax];
    238         if (newStrings == NULL) {
    239             status = U_MEMORY_ALLOCATION_ERROR;
    240             return;
    241         }
    242         for (int32_t i=0; i<listSize; ++i) {
    243             newStrings[i] = strings[i];
    244         }
    245 
    246 #ifdef INSTRUMENT_STRING_LIST
    247         int32_t _h = listSize / STRING_LIST_BUFFER_SIZE;
    248 
    249         if (_h > 100) {
    250             _h = 100;
    251         }
    252 
    253         _histogram[_h] += 1;
    254 #endif
    255 
    256         delete[] strings;
    257         strings = newStrings;
    258         listMax = newMax;
    259     }
    260 
    261     // The ctor initialized all the strings in
    262     // the array to empty strings, so this
    263     // is the same as copying the source string.
    264     strings[listSize++].append(*string);
    265 }
    266 
    267 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status)
    268 {
    269     const UnicodeString string(chars, count);
    270 
    271     add(&string, status);
    272 }
    273 
    274 const UnicodeString *StringList::get(int32_t index) const
    275 {
    276     if (index >= 0 && index < listSize) {
    277         return &strings[index];
    278     }
    279 
    280     return NULL;
    281 }
    282 
    283 int32_t StringList::size() const
    284 {
    285     return listSize;
    286 }
    287 
    288 
    289 U_CFUNC void deleteStringList(void *obj);
    290 
    291 class CEToStringsMap : public UMemory
    292 {
    293 public:
    294 
    295     CEToStringsMap(UErrorCode &status);
    296     ~CEToStringsMap();
    297 
    298     void put(uint32_t ce, UnicodeString *string, UErrorCode &status);
    299     StringList *getStringList(uint32_t ce) const;
    300 
    301 private:
    302 
    303     void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status);
    304     UHashtable *map;
    305 };
    306 
    307 CEToStringsMap::CEToStringsMap(UErrorCode &status)
    308     : map(NULL)
    309 {
    310     if (U_FAILURE(status)) {
    311         return;
    312     }
    313 
    314     map = uhash_open(uhash_hashLong, uhash_compareLong,
    315                      uhash_compareCaselessUnicodeString,
    316                      &status);
    317 
    318     if (U_FAILURE(status)) {
    319         return;
    320     }
    321 
    322     uhash_setValueDeleter(map, deleteStringList);
    323 }
    324 
    325 CEToStringsMap::~CEToStringsMap()
    326 {
    327     uhash_close(map);
    328 }
    329 
    330 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status)
    331 {
    332     StringList *strings = getStringList(ce);
    333 
    334     if (strings == NULL) {
    335         strings = new StringList(status);
    336 
    337         if (strings == NULL || U_FAILURE(status)) {
    338             status = U_MEMORY_ALLOCATION_ERROR;
    339             return;
    340         }
    341 
    342         putStringList(ce, strings, status);
    343     }
    344 
    345     strings->add(string, status);
    346 }
    347 
    348 StringList *CEToStringsMap::getStringList(uint32_t ce) const
    349 {
    350     return (StringList *) uhash_iget(map, ce);
    351 }
    352 
    353 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status)
    354 {
    355     uhash_iput(map, ce, (void *) stringList, &status);
    356 }
    357 
    358 U_CFUNC void deleteStringList(void *obj)
    359 {
    360     StringList *strings = (StringList *) obj;
    361 
    362     delete strings;
    363 }
    364 
    365 U_CFUNC void deleteCEList(void *obj);
    366 U_CFUNC void deleteUnicodeStringKey(void *obj);
    367 
    368 class StringToCEsMap : public UMemory
    369 {
    370 public:
    371     StringToCEsMap(UErrorCode &status);
    372     ~StringToCEsMap();
    373 
    374     void put(const UnicodeString *string, const CEList *ces, UErrorCode &status);
    375     const CEList *get(const UnicodeString *string);
    376     void free(const CEList *list);
    377 
    378 private:
    379 
    380 
    381     UHashtable *map;
    382 };
    383 
    384 StringToCEsMap::StringToCEsMap(UErrorCode &status)
    385     : map(NULL)
    386 {
    387     if (U_FAILURE(status)) {
    388         return;
    389     }
    390 
    391     map = uhash_open(uhash_hashUnicodeString,
    392                      uhash_compareUnicodeString,
    393                      uhash_compareLong,
    394                      &status);
    395 
    396     if (U_FAILURE(status)) {
    397         return;
    398     }
    399 
    400     uhash_setValueDeleter(map, deleteCEList);
    401     uhash_setKeyDeleter(map, deleteUnicodeStringKey);
    402 }
    403 
    404 StringToCEsMap::~StringToCEsMap()
    405 {
    406     uhash_close(map);
    407 }
    408 
    409 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status)
    410 {
    411     uhash_put(map, (void *) string, (void *) ces, &status);
    412 }
    413 
    414 const CEList *StringToCEsMap::get(const UnicodeString *string)
    415 {
    416     return (const CEList *) uhash_get(map, string);
    417 }
    418 
    419 U_CFUNC void deleteCEList(void *obj)
    420 {
    421     CEList *list = (CEList *) obj;
    422 
    423     delete list;
    424 }
    425 
    426 U_CFUNC void deleteUnicodeStringKey(void *obj)
    427 {
    428     UnicodeString *key = (UnicodeString *) obj;
    429 
    430     delete key;
    431 }
    432 
    433 class CollDataCacheEntry : public UMemory
    434 {
    435 public:
    436     CollDataCacheEntry(CollData *theData);
    437     ~CollDataCacheEntry();
    438 
    439     CollData *data;
    440     int32_t   refCount;
    441 };
    442 
    443 CollDataCacheEntry::CollDataCacheEntry(CollData *theData)
    444     : data(theData), refCount(1)
    445 {
    446     // nothing else to do
    447 }
    448 
    449 CollDataCacheEntry::~CollDataCacheEntry()
    450 {
    451     // check refCount?
    452     delete data;
    453 }
    454 
    455 class CollDataCache : public UMemory
    456 {
    457 public:
    458     CollDataCache(UErrorCode &status);
    459     ~CollDataCache();
    460 
    461     CollData *get(UCollator *collator, UErrorCode &status);
    462     void unref(CollData *collData);
    463 
    464     void flush();
    465 
    466 private:
    467     static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength);
    468     static void deleteKey(char *key);
    469 
    470     UHashtable *cache;
    471 };
    472 static UMTX lock;
    473 
    474 U_CFUNC void deleteChars(void * /*obj*/)
    475 {
    476     // char *chars = (char *) obj;
    477     // All the key strings are owned by the
    478     // CollData objects and don't need to
    479     // be freed here.
    480   //DELETE_ARRAY(chars);
    481 }
    482 
    483 U_CFUNC void deleteCollDataCacheEntry(void *obj)
    484 {
    485     CollDataCacheEntry *entry = (CollDataCacheEntry *) obj;
    486 
    487     delete entry;
    488 }
    489 
    490 CollDataCache::CollDataCache(UErrorCode &status)
    491     : cache(NULL)
    492 {
    493     if (U_FAILURE(status)) {
    494         return;
    495     }
    496 
    497     cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status);
    498 
    499     if (U_FAILURE(status)) {
    500         return;
    501     }
    502 
    503     uhash_setValueDeleter(cache, deleteCollDataCacheEntry);
    504     uhash_setKeyDeleter(cache, deleteChars);
    505 }
    506 
    507 CollDataCache::~CollDataCache()
    508 {
    509     umtx_lock(&lock);
    510     uhash_close(cache);
    511     cache = NULL;
    512     umtx_unlock(&lock);
    513 }
    514 
    515 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status)
    516 {
    517     char keyBuffer[KEY_BUFFER_SIZE];
    518     int32_t keyLength = KEY_BUFFER_SIZE;
    519     char *key = getKey(collator, keyBuffer, &keyLength);
    520     CollData *result = NULL, *newData = NULL;
    521     CollDataCacheEntry *entry = NULL, *newEntry = NULL;
    522 
    523     umtx_lock(&lock);
    524     entry = (CollDataCacheEntry *) uhash_get(cache, key);
    525 
    526     if (entry == NULL) {
    527         umtx_unlock(&lock);
    528 
    529         newData = new CollData(collator, key, keyLength, status);
    530         newEntry = new CollDataCacheEntry(newData);
    531 
    532         if (U_FAILURE(status) || newData == NULL || newEntry == NULL) {
    533             status = U_MEMORY_ALLOCATION_ERROR;
    534             return NULL;
    535         }
    536 
    537         umtx_lock(&lock);
    538         entry = (CollDataCacheEntry *) uhash_get(cache, key);
    539 
    540         if (entry == NULL) {
    541             uhash_put(cache, newData->key, newEntry, &status);
    542             umtx_unlock(&lock);
    543 
    544             if (U_FAILURE(status)) {
    545                 delete newEntry;
    546                 delete newData;
    547 
    548                 return NULL;
    549             }
    550 
    551             return newData;
    552         }
    553     }
    554 
    555     result = entry->data;
    556     entry->refCount += 1;
    557     umtx_unlock(&lock);
    558 
    559     if (key != keyBuffer) {
    560         deleteKey(key);
    561     }
    562 
    563     if (newEntry != NULL) {
    564         delete newEntry;
    565         delete newData;
    566     }
    567 
    568     return result;
    569 }
    570 
    571 void CollDataCache::unref(CollData *collData)
    572 {
    573     CollDataCacheEntry *entry = NULL;
    574 
    575     umtx_lock(&lock);
    576     entry = (CollDataCacheEntry *) uhash_get(cache, collData->key);
    577 
    578     if (entry != NULL) {
    579         entry->refCount -= 1;
    580     }
    581     umtx_unlock(&lock);
    582 }
    583 
    584 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength)
    585 {
    586     UErrorCode status = U_ZERO_ERROR;
    587     int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    588 
    589     if (len >= *keyBufferLength) {
    590         *keyBufferLength = (len + 2) & ~1;  // round to even length, leaving room for terminating null
    591         keyBuffer = NEW_ARRAY(char, *keyBufferLength);
    592         status = U_ZERO_ERROR;
    593 
    594         len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status);
    595     }
    596 
    597     keyBuffer[len] = '\0';
    598 
    599     return keyBuffer;
    600 }
    601 
    602 void CollDataCache::flush()
    603 {
    604     const UHashElement *element;
    605     int32_t pos = -1;
    606 
    607     umtx_lock(&lock);
    608     while ((element = uhash_nextElement(cache, &pos)) != NULL) {
    609         CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer;
    610 
    611         if (entry->refCount <= 0) {
    612             uhash_removeElement(cache, element);
    613         }
    614     }
    615     umtx_unlock(&lock);
    616 }
    617 
    618 void CollDataCache::deleteKey(char *key)
    619 {
    620     DELETE_ARRAY(key);
    621 }
    622 
    623 U_CDECL_BEGIN
    624 static UBool coll_data_cleanup(void) {
    625     CollData::freeCollDataCache();
    626   return TRUE;
    627 }
    628 U_CDECL_END
    629 
    630 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData)
    631 
    632 CollData::CollData()
    633 {
    634     // nothing
    635 }
    636 
    637 #define CLONE_COLLATOR
    638 
    639 //#define CACHE_CELISTS
    640 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status)
    641     : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL)
    642 {
    643     // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    644     // i.e. other, control, private use, format, surrogate
    645     U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    646     U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    647     USet *charsToTest = uset_openPattern(test_pattern, 20, &status);
    648 
    649     // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    650     // i.e. all the characers we handle implicitly
    651     U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    652     U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    653     USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);
    654 
    655     if (U_FAILURE(status)) {
    656         return;
    657     }
    658 
    659     USet *expansions   = uset_openEmpty();
    660     USet *contractions = uset_openEmpty();
    661     int32_t itemCount;
    662 
    663 #ifdef CACHE_CELISTS
    664     charsToCEList = new StringToCEsMap(status);
    665 
    666     if (U_FAILURE(status)) {
    667         goto bail;
    668     }
    669 #else
    670     charsToCEList = NULL;
    671 #endif
    672 
    673     ceToCharsStartingWith = new CEToStringsMap(status);
    674 
    675     if (U_FAILURE(status)) {
    676         goto bail;
    677     }
    678 
    679     if (cacheKeyLength > KEY_BUFFER_SIZE) {
    680         key = NEW_ARRAY(char, cacheKeyLength);
    681 
    682         if (key == NULL) {
    683             status = U_MEMORY_ALLOCATION_ERROR;
    684             goto bail;
    685         }
    686     } else {
    687         key = keyBuffer;
    688     }
    689 
    690     ARRAY_COPY(key, cacheKey, cacheKeyLength);
    691 
    692 #ifdef CLONE_COLLATOR
    693     coll = ucol_safeClone(collator, NULL, NULL, &status);
    694 
    695     if (U_FAILURE(status)) {
    696         goto bail;
    697     }
    698 #else
    699     coll = collator;
    700 #endif
    701 
    702     ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
    703 
    704     uset_addAll(charsToTest, contractions);
    705     uset_addAll(charsToTest, expansions);
    706     uset_removeAll(charsToTest, charsToRemove);
    707 
    708     itemCount = uset_getItemCount(charsToTest);
    709     for(int32_t item = 0; item < itemCount; item += 1) {
    710         UChar32 start = 0, end = 0;
    711         UChar buffer[16];
    712         int32_t len = uset_getItem(charsToTest, item, &start, &end,
    713                                    buffer, 16, &status);
    714 
    715         if (len == 0) {
    716             for (UChar32 ch = start; ch <= end; ch += 1) {
    717                 UnicodeString *st = new UnicodeString(ch);
    718 
    719                 if (st == NULL) {
    720                     status = U_MEMORY_ALLOCATION_ERROR;
    721                     break;
    722                 }
    723 
    724                 CEList *ceList = new CEList(coll, *st, status);
    725 
    726                 ceToCharsStartingWith->put(ceList->get(0), st, status);
    727 
    728 #ifdef CACHE_CELISTS
    729                 charsToCEList->put(st, ceList, status);
    730 #else
    731                 delete ceList;
    732                 delete st;
    733 #endif
    734             }
    735         } else if (len > 0) {
    736             UnicodeString *st = new UnicodeString(buffer, len);
    737 
    738             if (st == NULL) {
    739                 status = U_MEMORY_ALLOCATION_ERROR;
    740                 break;
    741             }
    742 
    743             CEList *ceList = new CEList(coll, *st, status);
    744 
    745             ceToCharsStartingWith->put(ceList->get(0), st, status);
    746 
    747 #ifdef CACHE_CELISTS
    748             charsToCEList->put(st, ceList, status);
    749 #else
    750             delete ceList;
    751             delete st;
    752 #endif
    753         } else {
    754             // shouldn't happen...
    755         }
    756 
    757         if (U_FAILURE(status)) {
    758              break;
    759         }
    760     }
    761 
    762 bail:
    763     uset_close(contractions);
    764     uset_close(expansions);
    765     uset_close(charsToRemove);
    766     uset_close(charsToTest);
    767 
    768     if (U_FAILURE(status)) {
    769         return;
    770     }
    771 
    772      UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A,
    773                             UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B};
    774      UChar  jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO};
    775      UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges));
    776      UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges));
    777      CEList hanList(coll, hanString, status);
    778      CEList jamoList(coll, jamoString, status);
    779      int32_t j = 0;
    780 
    781      if (U_FAILURE(status)) {
    782          return;
    783      }
    784 
    785      for (int32_t c = 0; c < jamoList.size(); c += 1) {
    786          uint32_t jce = jamoList[c];
    787 
    788          if (! isContinuation(jce)) {
    789              jamoLimits[j++] = jce;
    790          }
    791      }
    792 
    793      jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);
    794 
    795      minHan = 0xFFFFFFFF;
    796      maxHan = 0;
    797 
    798      for(int32_t h = 0; h < hanList.size(); h += 2) {
    799          uint32_t han = (uint32_t) hanList[h];
    800 
    801          if (han < minHan) {
    802              minHan = han;
    803          }
    804 
    805          if (han > maxHan) {
    806              maxHan = han;
    807          }
    808      }
    809 
    810      maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
    811 }
    812 
    813 CollData::~CollData()
    814 {
    815 #ifdef CLONE_COLLATOR
    816    ucol_close(coll);
    817 #endif
    818 
    819    if (key != keyBuffer) {
    820        DELETE_ARRAY(key);
    821    }
    822 
    823    delete ceToCharsStartingWith;
    824 
    825 #ifdef CACHE_CELISTS
    826    delete charsToCEList;
    827 #endif
    828 }
    829 
    830 UCollator *CollData::getCollator() const
    831 {
    832     return coll;
    833 }
    834 
    835 const StringList *CollData::getStringList(int32_t ce) const
    836 {
    837     return ceToCharsStartingWith->getStringList(ce);
    838 }
    839 
    840 const CEList *CollData::getCEList(const UnicodeString *string) const
    841 {
    842 #ifdef CACHE_CELISTS
    843     return charsToCEList->get(string);
    844 #else
    845     UErrorCode status = U_ZERO_ERROR;
    846     const CEList *list = new CEList(coll, *string, status);
    847 
    848     if (U_FAILURE(status)) {
    849         delete list;
    850         list = NULL;
    851     }
    852 
    853     return list;
    854 #endif
    855 }
    856 
    857 void CollData::freeCEList(const CEList *list)
    858 {
    859 #ifndef CACHE_CELISTS
    860     delete list;
    861 #endif
    862 }
    863 
    864 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const
    865 {
    866     // find out shortest string for the longest sequence of ces.
    867     // this can probably be folded with the minLengthCache...
    868 
    869     if (history[offset] >= 0) {
    870         return history[offset];
    871     }
    872 
    873     uint32_t ce = ceList->get(offset);
    874     int32_t maxOffset = ceList->size();
    875     int32_t shortestLength = INT32_MAX;
    876     const StringList *strings = ceToCharsStartingWith->getStringList(ce);
    877 
    878     if (strings != NULL) {
    879         int32_t stringCount = strings->size();
    880 
    881         for (int32_t s = 0; s < stringCount; s += 1) {
    882             const UnicodeString *string = strings->get(s);
    883 #ifdef CACHE_CELISTS
    884             const CEList *ceList2 = charsToCEList->get(string);
    885 #else
    886             UErrorCode status = U_ZERO_ERROR;
    887             const CEList *ceList2 = new CEList(coll, *string, status);
    888 
    889             if (U_FAILURE(status)) {
    890                 delete ceList2;
    891                 ceList2 = NULL;
    892             }
    893 #endif
    894 
    895             if (ceList->matchesAt(offset, ceList2)) {
    896                 int32_t clength = ceList2->size();
    897                 int32_t slength = string->length();
    898                 int32_t roffset = offset + clength;
    899                 int32_t rlength = 0;
    900 
    901                 if (roffset < maxOffset) {
    902                     rlength = minLengthInChars(ceList, roffset, history);
    903 
    904                     if (rlength <= 0) {
    905                     // delete before continue to avoid memory leak.
    906 #ifndef CACHE_CELISTS
    907                         delete ceList2;
    908 #endif
    909                         // ignore any dead ends
    910                         continue;
    911                     }
    912                 }
    913 
    914                 if (shortestLength > slength + rlength) {
    915                     shortestLength = slength + rlength;
    916                 }
    917             }
    918 
    919 #ifndef CACHE_CELISTS
    920             delete ceList2;
    921 #endif
    922         }
    923     }
    924 
    925     if (shortestLength == INT32_MAX) {
    926         // No matching strings at this offset. See if
    927         // the CE is in a range we can handle manually.
    928         if (ce >= minHan && ce < maxHan) {
    929             // all han have implicit orders which
    930             // generate two CEs.
    931             int32_t roffset = offset + 2;
    932             int32_t rlength = 0;
    933 
    934           //history[roffset++] = -1;
    935           //history[roffset++] = 1;
    936 
    937             if (roffset < maxOffset) {
    938                 rlength = minLengthInChars(ceList, roffset, history);
    939             }
    940 
    941             if (rlength < 0) {
    942                 return -1;
    943             }
    944 
    945             shortestLength = 1 + rlength;
    946             goto have_shortest;
    947         } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) {
    948             int32_t roffset = offset;
    949             int32_t rlength = 0;
    950 
    951             // **** this loop may not handle archaic Hangul correctly ****
    952             for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) {
    953                 uint32_t jce = ceList->get(roffset);
    954 
    955                 // Some Jamo have 24-bit primary order; skip the
    956                 // 2nd CE. This should always be OK because if
    957                 // we're still in the loop all we've seen are
    958                 // a series of Jamo in LVT order.
    959                 if (isContinuation(jce)) {
    960                     continue;
    961                 }
    962 
    963                 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) {
    964                     break;
    965                 }
    966             }
    967 
    968             if (roffset == offset) {
    969                 // we started with a non-L Jamo...
    970                 // just say it comes from a single character
    971                 roffset += 1;
    972 
    973                 // See if the single Jamo has a 24-bit order.
    974                 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) {
    975                     roffset += 1;
    976                 }
    977             }
    978 
    979             if (roffset < maxOffset) {
    980                 rlength = minLengthInChars(ceList, roffset, history);
    981             }
    982 
    983             if (rlength < 0) {
    984                 return -1;
    985             }
    986 
    987             shortestLength = 1 + rlength;
    988             goto have_shortest;
    989         }
    990 
    991         // Can't handle it manually either. Just move on.
    992         return -1;
    993     }
    994 
    995 have_shortest:
    996     history[offset] = shortestLength;
    997 
    998     return shortestLength;
    999 }
   1000 
   1001 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const
   1002 {
   1003     int32_t clength = ceList->size();
   1004     int32_t *history = NEW_ARRAY(int32_t, clength);
   1005 
   1006     for (int32_t i = 0; i < clength; i += 1) {
   1007         history[i] = -1;
   1008     }
   1009 
   1010     int32_t minLength = minLengthInChars(ceList, offset, history);
   1011 
   1012     DELETE_ARRAY(history);
   1013 
   1014     return minLength;
   1015 }
   1016 
   1017 CollData *CollData::open(UCollator *collator, UErrorCode &status)
   1018 {
   1019     if (U_FAILURE(status)) {
   1020         return NULL;
   1021     }
   1022 
   1023     CollDataCache *cache = getCollDataCache();
   1024 
   1025     return cache->get(collator, status);
   1026 }
   1027 
   1028 void CollData::close(CollData *collData)
   1029 {
   1030     CollDataCache *cache = getCollDataCache();
   1031 
   1032     cache->unref(collData);
   1033 }
   1034 
   1035 CollDataCache *CollData::collDataCache = NULL;
   1036 
   1037 CollDataCache *CollData::getCollDataCache()
   1038 {
   1039     UErrorCode status = U_ZERO_ERROR;
   1040     CollDataCache *cache = NULL;
   1041 
   1042     UMTX_CHECK(NULL, collDataCache, cache);
   1043 
   1044     if (cache == NULL) {
   1045         cache = new CollDataCache(status);
   1046 
   1047         if (U_FAILURE(status)) {
   1048             delete cache;
   1049             return NULL;
   1050         }
   1051 
   1052         umtx_lock(NULL);
   1053         if (collDataCache == NULL) {
   1054             collDataCache = cache;
   1055 
   1056             ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup);
   1057         }
   1058         umtx_unlock(NULL);
   1059 
   1060         if (collDataCache != cache) {
   1061             delete cache;
   1062         }
   1063     }
   1064 
   1065     return collDataCache;
   1066 }
   1067 
   1068 void CollData::freeCollDataCache()
   1069 {
   1070     CollDataCache *cache = NULL;
   1071 
   1072     UMTX_CHECK(NULL, collDataCache, cache);
   1073 
   1074     if (cache != NULL) {
   1075         umtx_lock(NULL);
   1076         if (collDataCache != NULL) {
   1077             collDataCache = NULL;
   1078         } else {
   1079             cache = NULL;
   1080         }
   1081         umtx_unlock(NULL);
   1082 
   1083         delete cache;
   1084     }
   1085 }
   1086 
   1087 void CollData::flushCollDataCache()
   1088 {
   1089     CollDataCache *cache = NULL;
   1090 
   1091     UMTX_CHECK(NULL, collDataCache, cache);
   1092 
   1093     // **** this will fail if the another ****
   1094     // **** thread deletes the cache here ****
   1095     if (cache != NULL) {
   1096         cache->flush();
   1097     }
   1098 }
   1099 
   1100 U_NAMESPACE_END
   1101 
   1102 #endif // #if !UCONFIG_NO_COLLATION
   1103