1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2009, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/usearch.h" 15 16 #include "cmemory.h" 17 #include "unicode/coll.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/coleitr.h" 20 #include "unicode/ucoleitr.h" 21 22 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 23 24 #include "unicode/uniset.h" 25 #include "unicode/uset.h" 26 #include "unicode/ustring.h" 27 #include "hash.h" 28 #include "uhash.h" 29 #include "ucln_in.h" 30 #include "ucol_imp.h" 31 #include "umutex.h" 32 33 #include "unicode/colldata.h" 34 35 U_NAMESPACE_BEGIN 36 37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 41 42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) 43 44 #ifdef INSTRUMENT_CELIST 45 int32_t CEList::_active = 0; 46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 47 #endif 48 49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 51 { 52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 53 UCollationStrength strength = ucol_getStrength(coll); 54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 55 uint32_t variableTop = ucol_getVariableTop(coll, &status); 56 uint32_t strengthMask = 0; 57 int32_t order; 58 59 if (U_FAILURE(status)) { 60 return; 61 } 62 63 // **** only set flag if string has Han(gul) **** 64 ucol_forceHanImplicit(elems, &status); 65 66 switch (strength) 67 { 68 default: 69 strengthMask |= UCOL_TERTIARYORDERMASK; 70 /* fall through */ 71 72 case UCOL_SECONDARY: 73 strengthMask |= UCOL_SECONDARYORDERMASK; 74 /* fall through */ 75 76 case UCOL_PRIMARY: 77 strengthMask |= UCOL_PRIMARYORDERMASK; 78 } 79 80 #ifdef INSTRUMENT_CELIST 81 _active += 1; 82 _histogram[0] += 1; 83 #endif 84 85 ces = ceBuffer; 86 87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 88 UBool cont = isContinuation(order); 89 90 order &= strengthMask; 91 92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 93 if (strength >= UCOL_QUATERNARY) { 94 order &= UCOL_PRIMARYORDERMASK; 95 } else { 96 order = UCOL_IGNORABLE; 97 } 98 } 99 100 if (order == UCOL_IGNORABLE) { 101 continue; 102 } 103 104 if (cont) { 105 order |= UCOL_CONTINUATION_MARKER; 106 } 107 108 add(order, status); 109 } 110 111 ucol_closeElements(elems); 112 } 113 114 CEList::~CEList() 115 { 116 #ifdef INSTRUMENT_CELIST 117 _active -= 1; 118 #endif 119 120 if (ces != ceBuffer) { 121 DELETE_ARRAY(ces); 122 } 123 } 124 125 void CEList::add(uint32_t ce, UErrorCode &status) 126 { 127 if (U_FAILURE(status)) { 128 return; 129 } 130 131 if (listSize >= listMax) { 132 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 133 134 #ifdef INSTRUMENT_CELIST 135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1; 136 #endif 137 138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 139 140 if (newCEs == NULL) { 141 status = U_MEMORY_ALLOCATION_ERROR; 142 return; 143 } 144 145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 146 147 if (ces != ceBuffer) { 148 DELETE_ARRAY(ces); 149 } 150 151 ces = newCEs; 152 listMax = newMax; 153 } 154 155 ces[listSize++] = ce; 156 } 157 158 uint32_t CEList::get(int32_t index) const 159 { 160 if (index >= 0 && index < listSize) { 161 return ces[index]; 162 } 163 164 return UCOL_NULLORDER; 165 } 166 167 uint32_t &CEList::operator[](int32_t index) const 168 { 169 return ces[index]; 170 } 171 172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 173 { 174 if (other == NULL || listSize - offset < other->size()) { 175 return FALSE; 176 } 177 178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 179 if (ces[i] != (*other)[j]) { 180 return FALSE; 181 } 182 } 183 184 return TRUE; 185 } 186 187 int32_t CEList::size() const 188 { 189 return listSize; 190 } 191 192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) 193 194 #ifdef INSTRUMENT_STRING_LIST 195 int32_t StringList::_lists = 0; 196 int32_t StringList::_strings = 0; 197 int32_t StringList::_histogram[101] = {0}; 198 #endif 199 200 StringList::StringList(UErrorCode &status) 201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 202 { 203 if (U_FAILURE(status)) { 204 return; 205 } 206 207 strings = new UnicodeString [listMax]; 208 209 if (strings == NULL) { 210 status = U_MEMORY_ALLOCATION_ERROR; 211 return; 212 } 213 214 #ifdef INSTRUMENT_STRING_LIST 215 _lists += 1; 216 _histogram[0] += 1; 217 #endif 218 } 219 220 StringList::~StringList() 221 { 222 delete[] strings; 223 } 224 225 void StringList::add(const UnicodeString *string, UErrorCode &status) 226 { 227 if (U_FAILURE(status)) { 228 return; 229 } 230 231 #ifdef INSTRUMENT_STRING_LIST 232 _strings += 1; 233 #endif 234 235 if (listSize >= listMax) { 236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 237 238 UnicodeString *newStrings = new UnicodeString[newMax]; 239 if (newStrings == NULL) { 240 status = U_MEMORY_ALLOCATION_ERROR; 241 return; 242 } 243 for (int32_t i=0; i<listSize; ++i) { 244 newStrings[i] = strings[i]; 245 } 246 247 #ifdef INSTRUMENT_STRING_LIST 248 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; 249 250 if (_h > 100) { 251 _h = 100; 252 } 253 254 _histogram[_h] += 1; 255 #endif 256 257 delete[] strings; 258 strings = newStrings; 259 listMax = newMax; 260 } 261 262 // The ctor initialized all the strings in 263 // the array to empty strings, so this 264 // is the same as copying the source string. 265 strings[listSize++].append(*string); 266 } 267 268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 269 { 270 const UnicodeString string(chars, count); 271 272 add(&string, status); 273 } 274 275 const UnicodeString *StringList::get(int32_t index) const 276 { 277 if (index >= 0 && index < listSize) { 278 return &strings[index]; 279 } 280 281 return NULL; 282 } 283 284 int32_t StringList::size() const 285 { 286 return listSize; 287 } 288 289 290 U_CFUNC void deleteStringList(void *obj); 291 292 class CEToStringsMap : public UMemory 293 { 294 public: 295 296 CEToStringsMap(UErrorCode &status); 297 ~CEToStringsMap(); 298 299 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 300 StringList *getStringList(uint32_t ce) const; 301 302 private: 303 304 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 305 UHashtable *map; 306 }; 307 308 CEToStringsMap::CEToStringsMap(UErrorCode &status) 309 : map(NULL) 310 { 311 if (U_FAILURE(status)) { 312 return; 313 } 314 315 map = uhash_open(uhash_hashLong, uhash_compareLong, 316 uhash_compareCaselessUnicodeString, 317 &status); 318 319 if (U_FAILURE(status)) { 320 return; 321 } 322 323 uhash_setValueDeleter(map, deleteStringList); 324 } 325 326 CEToStringsMap::~CEToStringsMap() 327 { 328 uhash_close(map); 329 } 330 331 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 332 { 333 StringList *strings = getStringList(ce); 334 335 if (strings == NULL) { 336 strings = new StringList(status); 337 338 if (strings == NULL || U_FAILURE(status)) { 339 status = U_MEMORY_ALLOCATION_ERROR; 340 return; 341 } 342 343 putStringList(ce, strings, status); 344 } 345 346 strings->add(string, status); 347 } 348 349 StringList *CEToStringsMap::getStringList(uint32_t ce) const 350 { 351 return (StringList *) uhash_iget(map, ce); 352 } 353 354 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 355 { 356 uhash_iput(map, ce, (void *) stringList, &status); 357 } 358 359 U_CFUNC void deleteStringList(void *obj) 360 { 361 StringList *strings = (StringList *) obj; 362 363 delete strings; 364 } 365 366 U_CFUNC void deleteCEList(void *obj); 367 U_CFUNC void deleteUnicodeStringKey(void *obj); 368 369 class StringToCEsMap : public UMemory 370 { 371 public: 372 StringToCEsMap(UErrorCode &status); 373 ~StringToCEsMap(); 374 375 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); 376 const CEList *get(const UnicodeString *string); 377 void free(const CEList *list); 378 379 private: 380 381 382 UHashtable *map; 383 }; 384 385 StringToCEsMap::StringToCEsMap(UErrorCode &status) 386 : map(NULL) 387 { 388 if (U_FAILURE(status)) { 389 return; 390 } 391 392 map = uhash_open(uhash_hashUnicodeString, 393 uhash_compareUnicodeString, 394 uhash_compareLong, 395 &status); 396 397 if (U_FAILURE(status)) { 398 return; 399 } 400 401 uhash_setValueDeleter(map, deleteCEList); 402 uhash_setKeyDeleter(map, deleteUnicodeStringKey); 403 } 404 405 StringToCEsMap::~StringToCEsMap() 406 { 407 uhash_close(map); 408 } 409 410 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status) 411 { 412 uhash_put(map, (void *) string, (void *) ces, &status); 413 } 414 415 const CEList *StringToCEsMap::get(const UnicodeString *string) 416 { 417 return (const CEList *) uhash_get(map, string); 418 } 419 420 U_CFUNC void deleteCEList(void *obj) 421 { 422 CEList *list = (CEList *) obj; 423 424 delete list; 425 } 426 427 U_CFUNC void deleteUnicodeStringKey(void *obj) 428 { 429 UnicodeString *key = (UnicodeString *) obj; 430 431 delete key; 432 } 433 434 class CollDataCacheEntry : public UMemory 435 { 436 public: 437 CollDataCacheEntry(CollData *theData); 438 ~CollDataCacheEntry(); 439 440 CollData *data; 441 int32_t refCount; 442 }; 443 444 CollDataCacheEntry::CollDataCacheEntry(CollData *theData) 445 : data(theData), refCount(1) 446 { 447 // nothing else to do 448 } 449 450 CollDataCacheEntry::~CollDataCacheEntry() 451 { 452 // check refCount? 453 delete data; 454 } 455 456 class CollDataCache : public UMemory 457 { 458 public: 459 CollDataCache(UErrorCode &status); 460 ~CollDataCache(); 461 462 CollData *get(UCollator *collator, UErrorCode &status); 463 void unref(CollData *collData); 464 465 void flush(); 466 467 private: 468 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); 469 static void deleteKey(char *key); 470 471 UMTX lock; 472 UHashtable *cache; 473 }; 474 475 U_CFUNC void deleteChars(void * /*obj*/) 476 { 477 // char *chars = (char *) obj; 478 // All the key strings are owned by the 479 // CollData objects and don't need to 480 // be freed here. 481 //DELETE_ARRAY(chars); 482 } 483 484 U_CFUNC void deleteCollDataCacheEntry(void *obj) 485 { 486 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; 487 488 delete entry; 489 } 490 491 CollDataCache::CollDataCache(UErrorCode &status) 492 : lock(0), cache(NULL) 493 { 494 if (U_FAILURE(status)) { 495 return; 496 } 497 498 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status); 499 500 if (U_FAILURE(status)) { 501 return; 502 } 503 504 uhash_setValueDeleter(cache, deleteCollDataCacheEntry); 505 uhash_setKeyDeleter(cache, deleteChars); 506 } 507 508 CollDataCache::~CollDataCache() 509 { 510 umtx_lock(&lock); 511 uhash_close(cache); 512 cache = NULL; 513 umtx_unlock(&lock); 514 515 umtx_destroy(&lock); 516 } 517 518 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) 519 { 520 char keyBuffer[KEY_BUFFER_SIZE]; 521 int32_t keyLength = KEY_BUFFER_SIZE; 522 char *key = getKey(collator, keyBuffer, &keyLength); 523 CollData *result = NULL, *newData = NULL; 524 CollDataCacheEntry *entry = NULL, *newEntry = NULL; 525 526 umtx_lock(&lock); 527 entry = (CollDataCacheEntry *) uhash_get(cache, key); 528 529 if (entry == NULL) { 530 umtx_unlock(&lock); 531 532 newData = new CollData(collator, key, keyLength, status); 533 newEntry = new CollDataCacheEntry(newData); 534 535 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { 536 status = U_MEMORY_ALLOCATION_ERROR; 537 return NULL; 538 } 539 540 umtx_lock(&lock); 541 entry = (CollDataCacheEntry *) uhash_get(cache, key); 542 543 if (entry == NULL) { 544 uhash_put(cache, newData->key, newEntry, &status); 545 umtx_unlock(&lock); 546 547 if (U_FAILURE(status)) { 548 delete newEntry; 549 delete newData; 550 551 return NULL; 552 } 553 554 return newData; 555 } 556 } 557 558 result = entry->data; 559 entry->refCount += 1; 560 umtx_unlock(&lock); 561 562 if (key != keyBuffer) { 563 deleteKey(key); 564 } 565 566 if (newEntry != NULL) { 567 delete newEntry; 568 delete newData; 569 } 570 571 return result; 572 } 573 574 void CollDataCache::unref(CollData *collData) 575 { 576 CollDataCacheEntry *entry = NULL; 577 578 umtx_lock(&lock); 579 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); 580 581 if (entry != NULL) { 582 entry->refCount -= 1; 583 } 584 umtx_unlock(&lock); 585 } 586 587 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength) 588 { 589 UErrorCode status = U_ZERO_ERROR; 590 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 591 592 if (len >= *keyBufferLength) { 593 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null 594 keyBuffer = NEW_ARRAY(char, *keyBufferLength); 595 status = U_ZERO_ERROR; 596 597 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 598 } 599 600 keyBuffer[len] = '\0'; 601 602 return keyBuffer; 603 } 604 605 void CollDataCache::flush() 606 { 607 const UHashElement *element; 608 int32_t pos = -1; 609 610 umtx_lock(&lock); 611 while ((element = uhash_nextElement(cache, &pos)) != NULL) { 612 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer; 613 614 if (entry->refCount <= 0) { 615 uhash_removeElement(cache, element); 616 } 617 } 618 umtx_unlock(&lock); 619 } 620 621 void CollDataCache::deleteKey(char *key) 622 { 623 DELETE_ARRAY(key); 624 } 625 626 U_CDECL_BEGIN 627 static UBool coll_data_cleanup(void) { 628 CollData::freeCollDataCache(); 629 return TRUE; 630 } 631 U_CDECL_END 632 633 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) 634 635 CollData::CollData() 636 { 637 // nothing 638 } 639 640 #define CLONE_COLLATOR 641 642 //#define CACHE_CELISTS 643 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status) 644 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) 645 { 646 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 647 // i.e. other, control, private use, format, surrogate 648 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 649 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 650 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 651 652 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 653 // i.e. all the characers we handle implicitly 654 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 655 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 656 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 657 658 if (U_FAILURE(status)) { 659 return; 660 } 661 662 USet *expansions = uset_openEmpty(); 663 USet *contractions = uset_openEmpty(); 664 int32_t itemCount; 665 666 #ifdef CACHE_CELISTS 667 charsToCEList = new StringToCEsMap(status); 668 669 if (U_FAILURE(status)) { 670 goto bail; 671 } 672 #else 673 charsToCEList = NULL; 674 #endif 675 676 ceToCharsStartingWith = new CEToStringsMap(status); 677 678 if (U_FAILURE(status)) { 679 goto bail; 680 } 681 682 if (cacheKeyLength > KEY_BUFFER_SIZE) { 683 key = NEW_ARRAY(char, cacheKeyLength); 684 685 if (key == NULL) { 686 status = U_MEMORY_ALLOCATION_ERROR; 687 goto bail; 688 } 689 } else { 690 key = keyBuffer; 691 } 692 693 ARRAY_COPY(key, cacheKey, cacheKeyLength); 694 695 #ifdef CLONE_COLLATOR 696 coll = ucol_safeClone(collator, NULL, NULL, &status); 697 698 if (U_FAILURE(status)) { 699 goto bail; 700 } 701 #else 702 coll = collator; 703 #endif 704 705 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 706 707 uset_addAll(charsToTest, contractions); 708 uset_addAll(charsToTest, expansions); 709 uset_removeAll(charsToTest, charsToRemove); 710 711 itemCount = uset_getItemCount(charsToTest); 712 for(int32_t item = 0; item < itemCount; item += 1) { 713 UChar32 start = 0, end = 0; 714 UChar buffer[16]; 715 int32_t len = uset_getItem(charsToTest, item, &start, &end, 716 buffer, 16, &status); 717 718 if (len == 0) { 719 for (UChar32 ch = start; ch <= end; ch += 1) { 720 UnicodeString *st = new UnicodeString(ch); 721 722 if (st == NULL) { 723 status = U_MEMORY_ALLOCATION_ERROR; 724 break; 725 } 726 727 CEList *ceList = new CEList(coll, *st, status); 728 729 ceToCharsStartingWith->put(ceList->get(0), st, status); 730 731 #ifdef CACHE_CELISTS 732 charsToCEList->put(st, ceList, status); 733 #else 734 delete ceList; 735 delete st; 736 #endif 737 } 738 } else if (len > 0) { 739 UnicodeString *st = new UnicodeString(buffer, len); 740 741 if (st == NULL) { 742 status = U_MEMORY_ALLOCATION_ERROR; 743 break; 744 } 745 746 CEList *ceList = new CEList(coll, *st, status); 747 748 ceToCharsStartingWith->put(ceList->get(0), st, status); 749 750 #ifdef CACHE_CELISTS 751 charsToCEList->put(st, ceList, status); 752 #else 753 delete ceList; 754 delete st; 755 #endif 756 } else { 757 // shouldn't happen... 758 } 759 760 if (U_FAILURE(status)) { 761 break; 762 } 763 } 764 765 bail: 766 uset_close(contractions); 767 uset_close(expansions); 768 uset_close(charsToRemove); 769 uset_close(charsToTest); 770 771 if (U_FAILURE(status)) { 772 return; 773 } 774 775 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, 776 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; 777 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; 778 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); 779 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 780 CEList hanList(coll, hanString, status); 781 CEList jamoList(coll, jamoString, status); 782 int32_t j = 0; 783 784 if (U_FAILURE(status)) { 785 return; 786 } 787 788 for (int32_t c = 0; c < jamoList.size(); c += 1) { 789 uint32_t jce = jamoList[c]; 790 791 if (! isContinuation(jce)) { 792 jamoLimits[j++] = jce; 793 } 794 } 795 796 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 797 798 minHan = 0xFFFFFFFF; 799 maxHan = 0; 800 801 for(int32_t h = 0; h < hanList.size(); h += 2) { 802 uint32_t han = (uint32_t) hanList[h]; 803 804 if (han < minHan) { 805 minHan = han; 806 } 807 808 if (han > maxHan) { 809 maxHan = han; 810 } 811 } 812 813 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 814 } 815 816 CollData::~CollData() 817 { 818 #ifdef CLONE_COLLATOR 819 ucol_close(coll); 820 #endif 821 822 if (key != keyBuffer) { 823 DELETE_ARRAY(key); 824 } 825 826 delete ceToCharsStartingWith; 827 828 #ifdef CACHE_CELISTS 829 delete charsToCEList; 830 #endif 831 } 832 833 UCollator *CollData::getCollator() const 834 { 835 return coll; 836 } 837 838 const StringList *CollData::getStringList(int32_t ce) const 839 { 840 return ceToCharsStartingWith->getStringList(ce); 841 } 842 843 const CEList *CollData::getCEList(const UnicodeString *string) const 844 { 845 #ifdef CACHE_CELISTS 846 return charsToCEList->get(string); 847 #else 848 UErrorCode status = U_ZERO_ERROR; 849 const CEList *list = new CEList(coll, *string, status); 850 851 if (U_FAILURE(status)) { 852 delete list; 853 list = NULL; 854 } 855 856 return list; 857 #endif 858 } 859 860 void CollData::freeCEList(const CEList *list) 861 { 862 #ifndef CACHE_CELISTS 863 delete list; 864 #endif 865 } 866 867 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 868 { 869 // find out shortest string for the longest sequence of ces. 870 // this can probably be folded with the minLengthCache... 871 872 if (history[offset] >= 0) { 873 return history[offset]; 874 } 875 876 uint32_t ce = ceList->get(offset); 877 int32_t maxOffset = ceList->size(); 878 int32_t shortestLength = INT32_MAX; 879 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 880 881 if (strings != NULL) { 882 int32_t stringCount = strings->size(); 883 884 for (int32_t s = 0; s < stringCount; s += 1) { 885 const UnicodeString *string = strings->get(s); 886 #ifdef CACHE_CELISTS 887 const CEList *ceList2 = charsToCEList->get(string); 888 #else 889 UErrorCode status = U_ZERO_ERROR; 890 const CEList *ceList2 = new CEList(coll, *string, status); 891 892 if (U_FAILURE(status)) { 893 delete ceList2; 894 ceList2 = NULL; 895 } 896 #endif 897 898 if (ceList->matchesAt(offset, ceList2)) { 899 int32_t clength = ceList2->size(); 900 int32_t slength = string->length(); 901 int32_t roffset = offset + clength; 902 int32_t rlength = 0; 903 904 if (roffset < maxOffset) { 905 rlength = minLengthInChars(ceList, roffset, history); 906 907 if (rlength <= 0) { 908 // delete before continue to avoid memory leak. 909 #ifndef CACHE_CELISTS 910 delete ceList2; 911 #endif 912 // ignore any dead ends 913 continue; 914 } 915 } 916 917 if (shortestLength > slength + rlength) { 918 shortestLength = slength + rlength; 919 } 920 } 921 922 #ifndef CACHE_CELISTS 923 delete ceList2; 924 #endif 925 } 926 } 927 928 if (shortestLength == INT32_MAX) { 929 // No matching strings at this offset. See if 930 // the CE is in a range we can handle manually. 931 if (ce >= minHan && ce < maxHan) { 932 // all han have implicit orders which 933 // generate two CEs. 934 int32_t roffset = offset + 2; 935 int32_t rlength = 0; 936 937 //history[roffset++] = -1; 938 //history[roffset++] = 1; 939 940 if (roffset < maxOffset) { 941 rlength = minLengthInChars(ceList, roffset, history); 942 } 943 944 if (rlength < 0) { 945 return -1; 946 } 947 948 shortestLength = 1 + rlength; 949 goto have_shortest; 950 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 951 int32_t roffset = offset; 952 int32_t rlength = 0; 953 954 // **** this loop may not handle archaic Hangul correctly **** 955 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 956 uint32_t jce = ceList->get(roffset); 957 958 // Some Jamo have 24-bit primary order; skip the 959 // 2nd CE. This should always be OK because if 960 // we're still in the loop all we've seen are 961 // a series of Jamo in LVT order. 962 if (isContinuation(jce)) { 963 continue; 964 } 965 966 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 967 break; 968 } 969 } 970 971 if (roffset == offset) { 972 // we started with a non-L Jamo... 973 // just say it comes from a single character 974 roffset += 1; 975 976 // See if the single Jamo has a 24-bit order. 977 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 978 roffset += 1; 979 } 980 } 981 982 if (roffset < maxOffset) { 983 rlength = minLengthInChars(ceList, roffset, history); 984 } 985 986 if (rlength < 0) { 987 return -1; 988 } 989 990 shortestLength = 1 + rlength; 991 goto have_shortest; 992 } 993 994 // Can't handle it manually either. Just move on. 995 return -1; 996 } 997 998 have_shortest: 999 history[offset] = shortestLength; 1000 1001 return shortestLength; 1002 } 1003 1004 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 1005 { 1006 int32_t clength = ceList->size(); 1007 int32_t *history = NEW_ARRAY(int32_t, clength); 1008 1009 for (int32_t i = 0; i < clength; i += 1) { 1010 history[i] = -1; 1011 } 1012 1013 int32_t minLength = minLengthInChars(ceList, offset, history); 1014 1015 DELETE_ARRAY(history); 1016 1017 return minLength; 1018 } 1019 1020 CollData *CollData::open(UCollator *collator, UErrorCode &status) 1021 { 1022 if (U_FAILURE(status)) { 1023 return NULL; 1024 } 1025 1026 CollDataCache *cache = getCollDataCache(); 1027 1028 return cache->get(collator, status); 1029 } 1030 1031 void CollData::close(CollData *collData) 1032 { 1033 CollDataCache *cache = getCollDataCache(); 1034 1035 cache->unref(collData); 1036 } 1037 1038 CollDataCache *CollData::collDataCache = NULL; 1039 1040 CollDataCache *CollData::getCollDataCache() 1041 { 1042 UErrorCode status = U_ZERO_ERROR; 1043 CollDataCache *cache = NULL; 1044 1045 UMTX_CHECK(NULL, collDataCache, cache); 1046 1047 if (cache == NULL) { 1048 cache = new CollDataCache(status); 1049 1050 if (U_FAILURE(status)) { 1051 delete cache; 1052 return NULL; 1053 } 1054 1055 umtx_lock(NULL); 1056 if (collDataCache == NULL) { 1057 collDataCache = cache; 1058 1059 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); 1060 } 1061 umtx_unlock(NULL); 1062 1063 if (collDataCache != cache) { 1064 delete cache; 1065 } 1066 } 1067 1068 return collDataCache; 1069 } 1070 1071 void CollData::freeCollDataCache() 1072 { 1073 CollDataCache *cache = NULL; 1074 1075 UMTX_CHECK(NULL, collDataCache, cache); 1076 1077 if (cache != NULL) { 1078 umtx_lock(NULL); 1079 if (collDataCache != NULL) { 1080 collDataCache = NULL; 1081 } else { 1082 cache = NULL; 1083 } 1084 umtx_unlock(NULL); 1085 1086 delete cache; 1087 } 1088 } 1089 1090 void CollData::flushCollDataCache() 1091 { 1092 CollDataCache *cache = NULL; 1093 1094 UMTX_CHECK(NULL, collDataCache, cache); 1095 1096 // **** this will fail if the another **** 1097 // **** thread deletes the cache here **** 1098 if (cache != NULL) { 1099 cache->flush(); 1100 } 1101 } 1102 1103 U_NAMESPACE_END 1104 1105 #endif // #if !UCONFIG_NO_COLLATION 1106