1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2009, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/usearch.h" 15 16 #include "cmemory.h" 17 #include "unicode/coll.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/coleitr.h" 20 #include "unicode/ucoleitr.h" 21 22 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 23 24 #include "unicode/uniset.h" 25 #include "unicode/uset.h" 26 #include "unicode/ustring.h" 27 #include "hash.h" 28 #include "uhash.h" 29 #include "ucln_in.h" 30 #include "ucol_imp.h" 31 #include "umutex.h" 32 33 #include "unicode/colldata.h" 34 35 U_NAMESPACE_BEGIN 36 37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 41 42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) 43 44 #ifdef INSTRUMENT_CELIST 45 int32_t CEList::_active = 0; 46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 47 #endif 48 49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 51 { 52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 53 UCollationStrength strength = ucol_getStrength(coll); 54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 55 uint32_t variableTop = ucol_getVariableTop(coll, &status); 56 uint32_t strengthMask = 0; 57 int32_t order; 58 59 if (U_FAILURE(status)) { 60 return; 61 } 62 63 // **** only set flag if string has Han(gul) **** 64 ucol_forceHanImplicit(elems, &status); 65 66 switch (strength) 67 { 68 default: 69 strengthMask |= UCOL_TERTIARYORDERMASK; 70 /* fall through */ 71 72 case UCOL_SECONDARY: 73 strengthMask |= UCOL_SECONDARYORDERMASK; 74 /* fall through */ 75 76 case UCOL_PRIMARY: 77 strengthMask |= UCOL_PRIMARYORDERMASK; 78 } 79 80 #ifdef INSTRUMENT_CELIST 81 _active += 1; 82 _histogram[0] += 1; 83 #endif 84 85 ces = ceBuffer; 86 87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 88 UBool cont = isContinuation(order); 89 90 order &= strengthMask; 91 92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 93 if (strength >= UCOL_QUATERNARY) { 94 order &= UCOL_PRIMARYORDERMASK; 95 } else { 96 order = UCOL_IGNORABLE; 97 } 98 } 99 100 if (order == UCOL_IGNORABLE) { 101 continue; 102 } 103 104 if (cont) { 105 order |= UCOL_CONTINUATION_MARKER; 106 } 107 108 add(order, status); 109 } 110 111 ucol_closeElements(elems); 112 } 113 114 CEList::~CEList() 115 { 116 #ifdef INSTRUMENT_CELIST 117 _active -= 1; 118 #endif 119 120 if (ces != ceBuffer) { 121 DELETE_ARRAY(ces); 122 } 123 } 124 125 void CEList::add(uint32_t ce, UErrorCode &status) 126 { 127 if (U_FAILURE(status)) { 128 return; 129 } 130 131 if (listSize >= listMax) { 132 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 133 134 #ifdef INSTRUMENT_CELIST 135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1; 136 #endif 137 138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 139 140 if (newCEs == NULL) { 141 status = U_MEMORY_ALLOCATION_ERROR; 142 return; 143 } 144 145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 146 147 if (ces != ceBuffer) { 148 DELETE_ARRAY(ces); 149 } 150 151 ces = newCEs; 152 listMax = newMax; 153 } 154 155 ces[listSize++] = ce; 156 } 157 158 uint32_t CEList::get(int32_t index) const 159 { 160 if (index >= 0 && index < listSize) { 161 return ces[index]; 162 } 163 164 return UCOL_NULLORDER; 165 } 166 167 uint32_t &CEList::operator[](int32_t index) const 168 { 169 return ces[index]; 170 } 171 172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 173 { 174 if (other == NULL || listSize - offset < other->size()) { 175 return FALSE; 176 } 177 178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 179 if (ces[i] != (*other)[j]) { 180 return FALSE; 181 } 182 } 183 184 return TRUE; 185 } 186 187 int32_t CEList::size() const 188 { 189 return listSize; 190 } 191 192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) 193 194 #ifdef INSTRUMENT_STRING_LIST 195 int32_t StringList::_lists = 0; 196 int32_t StringList::_strings = 0; 197 int32_t StringList::_histogram[101] = {0}; 198 #endif 199 200 StringList::StringList(UErrorCode &status) 201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 202 { 203 if (U_FAILURE(status)) { 204 return; 205 } 206 207 strings = new UnicodeString [listMax]; 208 209 if (strings == NULL) { 210 status = U_MEMORY_ALLOCATION_ERROR; 211 return; 212 } 213 214 #ifdef INSTRUMENT_STRING_LIST 215 _lists += 1; 216 _histogram[0] += 1; 217 #endif 218 } 219 220 StringList::~StringList() 221 { 222 delete[] strings; 223 } 224 225 void StringList::add(const UnicodeString *string, UErrorCode &status) 226 { 227 if (U_FAILURE(status)) { 228 return; 229 } 230 231 #ifdef INSTRUMENT_STRING_LIST 232 _strings += 1; 233 #endif 234 235 if (listSize >= listMax) { 236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 237 238 UnicodeString *newStrings = new UnicodeString[newMax]; 239 240 uprv_memcpy(newStrings, strings, listSize * sizeof(UnicodeString)); 241 242 #ifdef INSTRUMENT_STRING_LIST 243 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; 244 245 if (_h > 100) { 246 _h = 100; 247 } 248 249 _histogram[_h] += 1; 250 #endif 251 252 delete[] strings; 253 strings = newStrings; 254 listMax = newMax; 255 } 256 257 // The ctor initialized all the strings in 258 // the array to empty strings, so this 259 // is the same as copying the source string. 260 strings[listSize++].append(*string); 261 } 262 263 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 264 { 265 const UnicodeString string(chars, count); 266 267 add(&string, status); 268 } 269 270 const UnicodeString *StringList::get(int32_t index) const 271 { 272 if (index >= 0 && index < listSize) { 273 return &strings[index]; 274 } 275 276 return NULL; 277 } 278 279 int32_t StringList::size() const 280 { 281 return listSize; 282 } 283 284 285 U_CFUNC void deleteStringList(void *obj); 286 287 class CEToStringsMap : public UMemory 288 { 289 public: 290 291 CEToStringsMap(UErrorCode &status); 292 ~CEToStringsMap(); 293 294 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 295 StringList *getStringList(uint32_t ce) const; 296 297 private: 298 299 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 300 UHashtable *map; 301 }; 302 303 CEToStringsMap::CEToStringsMap(UErrorCode &status) 304 : map(NULL) 305 { 306 if (U_FAILURE(status)) { 307 return; 308 } 309 310 map = uhash_open(uhash_hashLong, uhash_compareLong, 311 uhash_compareCaselessUnicodeString, 312 &status); 313 314 if (U_FAILURE(status)) { 315 return; 316 } 317 318 uhash_setValueDeleter(map, deleteStringList); 319 } 320 321 CEToStringsMap::~CEToStringsMap() 322 { 323 uhash_close(map); 324 } 325 326 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 327 { 328 StringList *strings = getStringList(ce); 329 330 if (strings == NULL) { 331 strings = new StringList(status); 332 333 if (strings == NULL || U_FAILURE(status)) { 334 status = U_MEMORY_ALLOCATION_ERROR; 335 return; 336 } 337 338 putStringList(ce, strings, status); 339 } 340 341 strings->add(string, status); 342 } 343 344 StringList *CEToStringsMap::getStringList(uint32_t ce) const 345 { 346 return (StringList *) uhash_iget(map, ce); 347 } 348 349 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 350 { 351 uhash_iput(map, ce, (void *) stringList, &status); 352 } 353 354 U_CFUNC void deleteStringList(void *obj) 355 { 356 StringList *strings = (StringList *) obj; 357 358 delete strings; 359 } 360 361 U_CFUNC void deleteCEList(void *obj); 362 U_CFUNC void deleteUnicodeStringKey(void *obj); 363 364 class StringToCEsMap : public UMemory 365 { 366 public: 367 StringToCEsMap(UErrorCode &status); 368 ~StringToCEsMap(); 369 370 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); 371 const CEList *get(const UnicodeString *string); 372 void free(const CEList *list); 373 374 private: 375 376 377 UHashtable *map; 378 }; 379 380 StringToCEsMap::StringToCEsMap(UErrorCode &status) 381 : map(NULL) 382 { 383 if (U_FAILURE(status)) { 384 return; 385 } 386 387 map = uhash_open(uhash_hashUnicodeString, 388 uhash_compareUnicodeString, 389 uhash_compareLong, 390 &status); 391 392 if (U_FAILURE(status)) { 393 return; 394 } 395 396 uhash_setValueDeleter(map, deleteCEList); 397 uhash_setKeyDeleter(map, deleteUnicodeStringKey); 398 } 399 400 StringToCEsMap::~StringToCEsMap() 401 { 402 uhash_close(map); 403 } 404 405 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status) 406 { 407 uhash_put(map, (void *) string, (void *) ces, &status); 408 } 409 410 const CEList *StringToCEsMap::get(const UnicodeString *string) 411 { 412 return (const CEList *) uhash_get(map, string); 413 } 414 415 U_CFUNC void deleteCEList(void *obj) 416 { 417 CEList *list = (CEList *) obj; 418 419 delete list; 420 } 421 422 U_CFUNC void deleteUnicodeStringKey(void *obj) 423 { 424 UnicodeString *key = (UnicodeString *) obj; 425 426 delete key; 427 } 428 429 class CollDataCacheEntry : public UMemory 430 { 431 public: 432 CollDataCacheEntry(CollData *theData); 433 ~CollDataCacheEntry(); 434 435 CollData *data; 436 int32_t refCount; 437 }; 438 439 CollDataCacheEntry::CollDataCacheEntry(CollData *theData) 440 : data(theData), refCount(1) 441 { 442 // nothing else to do 443 } 444 445 CollDataCacheEntry::~CollDataCacheEntry() 446 { 447 // check refCount? 448 delete data; 449 } 450 451 class CollDataCache : public UMemory 452 { 453 public: 454 CollDataCache(UErrorCode &status); 455 ~CollDataCache(); 456 457 CollData *get(UCollator *collator, UErrorCode &status); 458 void unref(CollData *collData); 459 460 void flush(); 461 462 private: 463 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); 464 static void deleteKey(char *key); 465 466 UMTX lock; 467 UHashtable *cache; 468 }; 469 470 U_CFUNC void deleteChars(void * /*obj*/) 471 { 472 // char *chars = (char *) obj; 473 // All the key strings are owned by the 474 // CollData objects and don't need to 475 // be freed here. 476 //DELETE_ARRAY(chars); 477 } 478 479 U_CFUNC void deleteCollDataCacheEntry(void *obj) 480 { 481 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; 482 483 delete entry; 484 } 485 486 CollDataCache::CollDataCache(UErrorCode &status) 487 : lock(0), cache(NULL) 488 { 489 if (U_FAILURE(status)) { 490 return; 491 } 492 493 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status); 494 495 if (U_FAILURE(status)) { 496 return; 497 } 498 499 uhash_setValueDeleter(cache, deleteCollDataCacheEntry); 500 uhash_setKeyDeleter(cache, deleteChars); 501 } 502 503 CollDataCache::~CollDataCache() 504 { 505 umtx_lock(&lock); 506 uhash_close(cache); 507 cache = NULL; 508 umtx_unlock(&lock); 509 510 umtx_destroy(&lock); 511 } 512 513 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) 514 { 515 char keyBuffer[KEY_BUFFER_SIZE]; 516 int32_t keyLength = KEY_BUFFER_SIZE; 517 char *key = getKey(collator, keyBuffer, &keyLength); 518 CollData *result = NULL, *newData = NULL; 519 CollDataCacheEntry *entry = NULL, *newEntry = NULL; 520 521 umtx_lock(&lock); 522 entry = (CollDataCacheEntry *) uhash_get(cache, key); 523 524 if (entry == NULL) { 525 umtx_unlock(&lock); 526 527 newData = new CollData(collator, key, keyLength, status); 528 newEntry = new CollDataCacheEntry(newData); 529 530 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { 531 status = U_MEMORY_ALLOCATION_ERROR; 532 return NULL; 533 } 534 535 umtx_lock(&lock); 536 entry = (CollDataCacheEntry *) uhash_get(cache, key); 537 538 if (entry == NULL) { 539 uhash_put(cache, newData->key, newEntry, &status); 540 umtx_unlock(&lock); 541 542 if (U_FAILURE(status)) { 543 delete newEntry; 544 delete newData; 545 546 return NULL; 547 } 548 549 return newData; 550 } 551 } 552 553 result = entry->data; 554 entry->refCount += 1; 555 umtx_unlock(&lock); 556 557 if (key != keyBuffer) { 558 deleteKey(key); 559 } 560 561 if (newEntry != NULL) { 562 delete newEntry; 563 delete newData; 564 } 565 566 return result; 567 } 568 569 void CollDataCache::unref(CollData *collData) 570 { 571 CollDataCacheEntry *entry = NULL; 572 573 umtx_lock(&lock); 574 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); 575 576 if (entry != NULL) { 577 entry->refCount -= 1; 578 } 579 umtx_unlock(&lock); 580 } 581 582 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength) 583 { 584 UErrorCode status = U_ZERO_ERROR; 585 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 586 587 if (len >= *keyBufferLength) { 588 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null 589 keyBuffer = NEW_ARRAY(char, *keyBufferLength); 590 status = U_ZERO_ERROR; 591 592 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 593 } 594 595 keyBuffer[len] = '\0'; 596 597 return keyBuffer; 598 } 599 600 void CollDataCache::flush() 601 { 602 const UHashElement *element; 603 int32_t pos = -1; 604 605 umtx_lock(&lock); 606 while ((element = uhash_nextElement(cache, &pos)) != NULL) { 607 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer; 608 609 if (entry->refCount <= 0) { 610 uhash_removeElement(cache, element); 611 } 612 } 613 umtx_unlock(&lock); 614 } 615 616 void CollDataCache::deleteKey(char *key) 617 { 618 DELETE_ARRAY(key); 619 } 620 621 U_CDECL_BEGIN 622 static UBool coll_data_cleanup(void) { 623 CollData::freeCollDataCache(); 624 return TRUE; 625 } 626 U_CDECL_END 627 628 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) 629 630 CollData::CollData() 631 { 632 // nothing 633 } 634 635 #define CLONE_COLLATOR 636 637 //#define CACHE_CELISTS 638 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status) 639 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) 640 { 641 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 642 // i.e. other, control, private use, format, surrogate 643 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 644 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 645 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 646 647 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 648 // i.e. all the characers we handle implicitly 649 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 650 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 651 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 652 653 if (U_FAILURE(status)) { 654 return; 655 } 656 657 USet *expansions = uset_openEmpty(); 658 USet *contractions = uset_openEmpty(); 659 int32_t itemCount; 660 661 #ifdef CACHE_CELISTS 662 charsToCEList = new StringToCEsMap(status); 663 664 if (U_FAILURE(status)) { 665 goto bail; 666 } 667 #else 668 charsToCEList = NULL; 669 #endif 670 671 ceToCharsStartingWith = new CEToStringsMap(status); 672 673 if (U_FAILURE(status)) { 674 goto bail; 675 } 676 677 if (cacheKeyLength > KEY_BUFFER_SIZE) { 678 key = NEW_ARRAY(char, cacheKeyLength); 679 680 if (key == NULL) { 681 status = U_MEMORY_ALLOCATION_ERROR; 682 goto bail; 683 } 684 } else { 685 key = keyBuffer; 686 } 687 688 ARRAY_COPY(key, cacheKey, cacheKeyLength); 689 690 #ifdef CLONE_COLLATOR 691 coll = ucol_safeClone(collator, NULL, NULL, &status); 692 693 if (U_FAILURE(status)) { 694 goto bail; 695 } 696 #else 697 coll = collator; 698 #endif 699 700 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 701 702 uset_addAll(charsToTest, contractions); 703 uset_addAll(charsToTest, expansions); 704 uset_removeAll(charsToTest, charsToRemove); 705 706 itemCount = uset_getItemCount(charsToTest); 707 for(int32_t item = 0; item < itemCount; item += 1) { 708 UChar32 start = 0, end = 0; 709 UChar buffer[16]; 710 int32_t len = uset_getItem(charsToTest, item, &start, &end, 711 buffer, 16, &status); 712 713 if (len == 0) { 714 for (UChar32 ch = start; ch <= end; ch += 1) { 715 UnicodeString *st = new UnicodeString(ch); 716 717 if (st == NULL) { 718 status = U_MEMORY_ALLOCATION_ERROR; 719 break; 720 } 721 722 CEList *ceList = new CEList(coll, *st, status); 723 724 ceToCharsStartingWith->put(ceList->get(0), st, status); 725 726 #ifdef CACHE_CELISTS 727 charsToCEList->put(st, ceList, status); 728 #else 729 delete ceList; 730 delete st; 731 #endif 732 } 733 } else if (len > 0) { 734 UnicodeString *st = new UnicodeString(buffer, len); 735 736 if (st == NULL) { 737 status = U_MEMORY_ALLOCATION_ERROR; 738 break; 739 } 740 741 CEList *ceList = new CEList(coll, *st, status); 742 743 ceToCharsStartingWith->put(ceList->get(0), st, status); 744 745 #ifdef CACHE_CELISTS 746 charsToCEList->put(st, ceList, status); 747 #else 748 delete ceList; 749 delete st; 750 #endif 751 } else { 752 // shouldn't happen... 753 } 754 755 if (U_FAILURE(status)) { 756 break; 757 } 758 } 759 760 bail: 761 uset_close(contractions); 762 uset_close(expansions); 763 uset_close(charsToRemove); 764 uset_close(charsToTest); 765 766 if (U_FAILURE(status)) { 767 return; 768 } 769 770 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, 771 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; 772 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; 773 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); 774 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 775 CEList hanList(coll, hanString, status); 776 CEList jamoList(coll, jamoString, status); 777 int32_t j = 0; 778 779 if (U_FAILURE(status)) { 780 return; 781 } 782 783 for (int32_t c = 0; c < jamoList.size(); c += 1) { 784 uint32_t jce = jamoList[c]; 785 786 if (! isContinuation(jce)) { 787 jamoLimits[j++] = jce; 788 } 789 } 790 791 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 792 793 minHan = 0xFFFFFFFF; 794 maxHan = 0; 795 796 for(int32_t h = 0; h < hanList.size(); h += 2) { 797 uint32_t han = (uint32_t) hanList[h]; 798 799 if (han < minHan) { 800 minHan = han; 801 } 802 803 if (han > maxHan) { 804 maxHan = han; 805 } 806 } 807 808 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 809 } 810 811 CollData::~CollData() 812 { 813 #ifdef CLONE_COLLATOR 814 ucol_close(coll); 815 #endif 816 817 if (key != keyBuffer) { 818 DELETE_ARRAY(key); 819 } 820 821 delete ceToCharsStartingWith; 822 823 #ifdef CACHE_CELISTS 824 delete charsToCEList; 825 #endif 826 } 827 828 UCollator *CollData::getCollator() const 829 { 830 return coll; 831 } 832 833 const StringList *CollData::getStringList(int32_t ce) const 834 { 835 return ceToCharsStartingWith->getStringList(ce); 836 } 837 838 const CEList *CollData::getCEList(const UnicodeString *string) const 839 { 840 #ifdef CACHE_CELISTS 841 return charsToCEList->get(string); 842 #else 843 UErrorCode status = U_ZERO_ERROR; 844 const CEList *list = new CEList(coll, *string, status); 845 846 if (U_FAILURE(status)) { 847 delete list; 848 list = NULL; 849 } 850 851 return list; 852 #endif 853 } 854 855 void CollData::freeCEList(const CEList *list) 856 { 857 #ifndef CACHE_CELISTS 858 delete list; 859 #endif 860 } 861 862 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 863 { 864 // find out shortest string for the longest sequence of ces. 865 // this can probably be folded with the minLengthCache... 866 867 if (history[offset] >= 0) { 868 return history[offset]; 869 } 870 871 uint32_t ce = ceList->get(offset); 872 int32_t maxOffset = ceList->size(); 873 int32_t shortestLength = INT32_MAX; 874 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 875 876 if (strings != NULL) { 877 int32_t stringCount = strings->size(); 878 879 for (int32_t s = 0; s < stringCount; s += 1) { 880 const UnicodeString *string = strings->get(s); 881 #ifdef CACHE_CELISTS 882 const CEList *ceList2 = charsToCEList->get(string); 883 #else 884 UErrorCode status = U_ZERO_ERROR; 885 const CEList *ceList2 = new CEList(coll, *string, status); 886 887 if (U_FAILURE(status)) { 888 delete ceList2; 889 ceList2 = NULL; 890 } 891 #endif 892 893 if (ceList->matchesAt(offset, ceList2)) { 894 int32_t clength = ceList2->size(); 895 int32_t slength = string->length(); 896 int32_t roffset = offset + clength; 897 int32_t rlength = 0; 898 899 if (roffset < maxOffset) { 900 rlength = minLengthInChars(ceList, roffset, history); 901 902 if (rlength <= 0) { 903 // delete before continue to avoid memory leak. 904 #ifndef CACHE_CELISTS 905 delete ceList2; 906 #endif 907 // ignore any dead ends 908 continue; 909 } 910 } 911 912 if (shortestLength > slength + rlength) { 913 shortestLength = slength + rlength; 914 } 915 } 916 917 #ifndef CACHE_CELISTS 918 delete ceList2; 919 #endif 920 } 921 } 922 923 if (shortestLength == INT32_MAX) { 924 // No matching strings at this offset. See if 925 // the CE is in a range we can handle manually. 926 if (ce >= minHan && ce < maxHan) { 927 // all han have implicit orders which 928 // generate two CEs. 929 int32_t roffset = offset + 2; 930 int32_t rlength = 0; 931 932 //history[roffset++] = -1; 933 //history[roffset++] = 1; 934 935 if (roffset < maxOffset) { 936 rlength = minLengthInChars(ceList, roffset, history); 937 } 938 939 if (rlength < 0) { 940 return -1; 941 } 942 943 shortestLength = 1 + rlength; 944 goto have_shortest; 945 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 946 int32_t roffset = offset; 947 int32_t rlength = 0; 948 949 // **** this loop may not handle archaic Hangul correctly **** 950 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 951 uint32_t jce = ceList->get(roffset); 952 953 // Some Jamo have 24-bit primary order; skip the 954 // 2nd CE. This should always be OK because if 955 // we're still in the loop all we've seen are 956 // a series of Jamo in LVT order. 957 if (isContinuation(jce)) { 958 continue; 959 } 960 961 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 962 break; 963 } 964 } 965 966 if (roffset == offset) { 967 // we started with a non-L Jamo... 968 // just say it comes from a single character 969 roffset += 1; 970 971 // See if the single Jamo has a 24-bit order. 972 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 973 roffset += 1; 974 } 975 } 976 977 if (roffset < maxOffset) { 978 rlength = minLengthInChars(ceList, roffset, history); 979 } 980 981 if (rlength < 0) { 982 return -1; 983 } 984 985 shortestLength = 1 + rlength; 986 goto have_shortest; 987 } 988 989 // Can't handle it manually either. Just move on. 990 return -1; 991 } 992 993 have_shortest: 994 history[offset] = shortestLength; 995 996 return shortestLength; 997 } 998 999 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 1000 { 1001 int32_t clength = ceList->size(); 1002 int32_t *history = NEW_ARRAY(int32_t, clength); 1003 1004 for (int32_t i = 0; i < clength; i += 1) { 1005 history[i] = -1; 1006 } 1007 1008 int32_t minLength = minLengthInChars(ceList, offset, history); 1009 1010 DELETE_ARRAY(history); 1011 1012 return minLength; 1013 } 1014 1015 CollData *CollData::open(UCollator *collator, UErrorCode &status) 1016 { 1017 if (U_FAILURE(status)) { 1018 return NULL; 1019 } 1020 1021 CollDataCache *cache = getCollDataCache(); 1022 1023 return cache->get(collator, status); 1024 } 1025 1026 void CollData::close(CollData *collData) 1027 { 1028 CollDataCache *cache = getCollDataCache(); 1029 1030 cache->unref(collData); 1031 } 1032 1033 CollDataCache *CollData::collDataCache = NULL; 1034 1035 CollDataCache *CollData::getCollDataCache() 1036 { 1037 UErrorCode status = U_ZERO_ERROR; 1038 CollDataCache *cache = NULL; 1039 1040 UMTX_CHECK(NULL, collDataCache, cache); 1041 1042 if (cache == NULL) { 1043 cache = new CollDataCache(status); 1044 1045 if (U_FAILURE(status)) { 1046 delete cache; 1047 return NULL; 1048 } 1049 1050 umtx_lock(NULL); 1051 if (collDataCache == NULL) { 1052 collDataCache = cache; 1053 1054 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); 1055 } 1056 umtx_unlock(NULL); 1057 1058 if (collDataCache != cache) { 1059 delete cache; 1060 } 1061 } 1062 1063 return collDataCache; 1064 } 1065 1066 void CollData::freeCollDataCache() 1067 { 1068 CollDataCache *cache = NULL; 1069 1070 UMTX_CHECK(NULL, collDataCache, cache); 1071 1072 if (cache != NULL) { 1073 umtx_lock(NULL); 1074 if (collDataCache != NULL) { 1075 collDataCache = NULL; 1076 } else { 1077 cache = NULL; 1078 } 1079 umtx_unlock(NULL); 1080 1081 delete cache; 1082 } 1083 } 1084 1085 void CollData::flushCollDataCache() 1086 { 1087 CollDataCache *cache = NULL; 1088 1089 UMTX_CHECK(NULL, collDataCache, cache); 1090 1091 // **** this will fail if the another **** 1092 // **** thread deletes the cache here **** 1093 if (cache != NULL) { 1094 cache->flush(); 1095 } 1096 } 1097 1098 U_NAMESPACE_END 1099 1100 #endif // #if !UCONFIG_NO_COLLATION 1101