1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2012, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/usearch.h" 15 16 #include "cmemory.h" 17 #include "unicode/coll.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/coleitr.h" 20 #include "unicode/ucoleitr.h" 21 22 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 23 24 #include "unicode/uniset.h" 25 #include "unicode/uset.h" 26 #include "unicode/ustring.h" 27 #include "hash.h" 28 #include "uhash.h" 29 #include "ucln_in.h" 30 #include "ucol_imp.h" 31 #include "umutex.h" 32 #include "uassert.h" 33 34 #include "unicode/colldata.h" 35 36 U_NAMESPACE_BEGIN 37 38 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 39 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 40 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 41 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 42 43 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) 44 45 #ifdef INSTRUMENT_CELIST 46 int32_t CEList::_active = 0; 47 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 48 #endif 49 50 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 51 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 52 { 53 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 54 UCollationStrength strength = ucol_getStrength(coll); 55 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 56 uint32_t variableTop = ucol_getVariableTop(coll, &status); 57 uint32_t strengthMask = 0; 58 int32_t order; 59 60 if (U_FAILURE(status)) { 61 return; 62 } 63 64 // **** only set flag if string has Han(gul) **** 65 ucol_forceHanImplicit(elems, &status); 66 67 switch (strength) 68 { 69 default: 70 strengthMask |= UCOL_TERTIARYORDERMASK; 71 /* fall through */ 72 73 case UCOL_SECONDARY: 74 strengthMask |= UCOL_SECONDARYORDERMASK; 75 /* fall through */ 76 77 case UCOL_PRIMARY: 78 strengthMask |= UCOL_PRIMARYORDERMASK; 79 } 80 81 #ifdef INSTRUMENT_CELIST 82 _active += 1; 83 _histogram[0] += 1; 84 #endif 85 86 ces = ceBuffer; 87 88 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 89 UBool cont = isContinuation(order); 90 91 order &= strengthMask; 92 93 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 94 if (strength >= UCOL_QUATERNARY) { 95 order &= UCOL_PRIMARYORDERMASK; 96 } else { 97 order = UCOL_IGNORABLE; 98 } 99 } 100 101 if (order == UCOL_IGNORABLE) { 102 continue; 103 } 104 105 if (cont) { 106 order |= UCOL_CONTINUATION_MARKER; 107 } 108 109 add(order, status); 110 } 111 112 ucol_closeElements(elems); 113 } 114 115 CEList::~CEList() 116 { 117 #ifdef INSTRUMENT_CELIST 118 _active -= 1; 119 #endif 120 121 if (ces != ceBuffer) { 122 DELETE_ARRAY(ces); 123 } 124 } 125 126 void CEList::add(uint32_t ce, UErrorCode &status) 127 { 128 if (U_FAILURE(status)) { 129 return; 130 } 131 132 if (listSize >= listMax) { 133 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 134 135 #ifdef INSTRUMENT_CELIST 136 _histogram[listSize / CELIST_BUFFER_SIZE] += 1; 137 #endif 138 139 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 140 141 if (newCEs == NULL) { 142 status = U_MEMORY_ALLOCATION_ERROR; 143 return; 144 } 145 146 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 147 148 if (ces != ceBuffer) { 149 DELETE_ARRAY(ces); 150 } 151 152 ces = newCEs; 153 listMax = newMax; 154 } 155 156 ces[listSize++] = ce; 157 } 158 159 uint32_t CEList::get(int32_t index) const 160 { 161 if (index >= 0 && index < listSize) { 162 return ces[index]; 163 } 164 165 return (uint32_t)UCOL_NULLORDER; 166 } 167 168 uint32_t &CEList::operator[](int32_t index) const 169 { 170 return ces[index]; 171 } 172 173 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 174 { 175 if (other == NULL || listSize - offset < other->size()) { 176 return FALSE; 177 } 178 179 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 180 if (ces[i] != (*other)[j]) { 181 return FALSE; 182 } 183 } 184 185 return TRUE; 186 } 187 188 int32_t CEList::size() const 189 { 190 return listSize; 191 } 192 193 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) 194 195 #ifdef INSTRUMENT_STRING_LIST 196 int32_t StringList::_lists = 0; 197 int32_t StringList::_strings = 0; 198 int32_t StringList::_histogram[101] = {0}; 199 #endif 200 201 StringList::StringList(UErrorCode &status) 202 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 203 { 204 if (U_FAILURE(status)) { 205 return; 206 } 207 208 strings = new UnicodeString [listMax]; 209 210 if (strings == NULL) { 211 status = U_MEMORY_ALLOCATION_ERROR; 212 return; 213 } 214 215 #ifdef INSTRUMENT_STRING_LIST 216 _lists += 1; 217 _histogram[0] += 1; 218 #endif 219 } 220 221 StringList::~StringList() 222 { 223 delete[] strings; 224 } 225 226 void StringList::add(const UnicodeString *string, UErrorCode &status) 227 { 228 if (U_FAILURE(status)) { 229 return; 230 } 231 232 #ifdef INSTRUMENT_STRING_LIST 233 _strings += 1; 234 #endif 235 236 if (listSize >= listMax) { 237 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 238 UnicodeString *newStrings = new UnicodeString[newMax]; 239 if (newStrings == NULL) { 240 status = U_MEMORY_ALLOCATION_ERROR; 241 return; 242 } 243 for (int32_t i=0; i<listSize; ++i) { 244 newStrings[i] = strings[i]; 245 } 246 247 #ifdef INSTRUMENT_STRING_LIST 248 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; 249 250 if (_h > 100) { 251 _h = 100; 252 } 253 254 _histogram[_h] += 1; 255 #endif 256 257 delete[] strings; 258 strings = newStrings; 259 listMax = newMax; 260 } 261 262 // The ctor initialized all the strings in 263 // the array to empty strings, so this 264 // is the same as copying the source string. 265 strings[listSize++].append(*string); 266 } 267 268 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 269 { 270 const UnicodeString string(chars, count); 271 272 add(&string, status); 273 } 274 275 const UnicodeString *StringList::get(int32_t index) const 276 { 277 if (index >= 0 && index < listSize) { 278 return &strings[index]; 279 } 280 281 return NULL; 282 } 283 284 int32_t StringList::size() const 285 { 286 return listSize; 287 } 288 289 290 U_CDECL_BEGIN 291 static void U_CALLCONV 292 deleteStringList(void *obj) 293 { 294 StringList *strings = (StringList *) obj; 295 296 delete strings; 297 } 298 static void U_CALLCONV 299 deleteCEList(void *obj) 300 { 301 CEList *list = (CEList *) obj; 302 303 delete list; 304 } 305 306 static void U_CALLCONV 307 deleteUnicodeStringKey(void *obj) 308 { 309 UnicodeString *key = (UnicodeString *) obj; 310 311 delete key; 312 } 313 314 static void U_CALLCONV 315 deleteChars(void * /*obj*/) 316 { 317 // char *chars = (char *) obj; 318 // All the key strings are owned by the 319 // CollData objects and don't need to 320 // be freed here. 321 //DELETE_ARRAY(chars); 322 } 323 324 U_CDECL_END 325 326 class CEToStringsMap : public UMemory 327 { 328 public: 329 330 CEToStringsMap(UErrorCode &status); 331 ~CEToStringsMap(); 332 333 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 334 StringList *getStringList(uint32_t ce) const; 335 336 private: 337 338 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 339 UHashtable *map; 340 }; 341 342 CEToStringsMap::CEToStringsMap(UErrorCode &status) 343 : map(NULL) 344 { 345 if (U_FAILURE(status)) { 346 return; 347 } 348 349 map = uhash_open(uhash_hashLong, uhash_compareLong, 350 uhash_compareCaselessUnicodeString, 351 &status); 352 353 if (U_FAILURE(status)) { 354 return; 355 } 356 357 uhash_setValueDeleter(map, deleteStringList); 358 } 359 360 CEToStringsMap::~CEToStringsMap() 361 { 362 uhash_close(map); 363 } 364 365 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 366 { 367 StringList *strings = getStringList(ce); 368 369 if (strings == NULL) { 370 strings = new StringList(status); 371 372 if (strings == NULL || U_FAILURE(status)) { 373 status = U_MEMORY_ALLOCATION_ERROR; 374 return; 375 } 376 377 putStringList(ce, strings, status); 378 } 379 380 strings->add(string, status); 381 } 382 383 StringList *CEToStringsMap::getStringList(uint32_t ce) const 384 { 385 return (StringList *) uhash_iget(map, ce); 386 } 387 388 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 389 { 390 uhash_iput(map, ce, (void *) stringList, &status); 391 } 392 393 class StringToCEsMap : public UMemory 394 { 395 public: 396 StringToCEsMap(UErrorCode &status); 397 ~StringToCEsMap(); 398 399 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); 400 const CEList *get(const UnicodeString *string); 401 void free(const CEList *list); 402 403 private: 404 405 406 UHashtable *map; 407 }; 408 409 StringToCEsMap::StringToCEsMap(UErrorCode &status) 410 : map(NULL) 411 { 412 if (U_FAILURE(status)) { 413 return; 414 } 415 416 map = uhash_open(uhash_hashUnicodeString, 417 uhash_compareUnicodeString, 418 uhash_compareLong, 419 &status); 420 421 if (U_FAILURE(status)) { 422 return; 423 } 424 425 uhash_setValueDeleter(map, deleteCEList); 426 uhash_setKeyDeleter(map, deleteUnicodeStringKey); 427 } 428 429 StringToCEsMap::~StringToCEsMap() 430 { 431 uhash_close(map); 432 } 433 434 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status) 435 { 436 uhash_put(map, (void *) string, (void *) ces, &status); 437 } 438 439 const CEList *StringToCEsMap::get(const UnicodeString *string) 440 { 441 return (const CEList *) uhash_get(map, string); 442 } 443 444 class CollDataCacheEntry : public UMemory 445 { 446 public: 447 CollDataCacheEntry(CollData *theData); 448 ~CollDataCacheEntry(); 449 450 CollData *data; 451 int32_t refCount; 452 }; 453 454 CollDataCacheEntry::CollDataCacheEntry(CollData *theData) 455 : data(theData), refCount(1) 456 { 457 // nothing else to do 458 } 459 460 CollDataCacheEntry::~CollDataCacheEntry() 461 { 462 // check refCount? 463 delete data; 464 } 465 466 class CollDataCache : public UMemory 467 { 468 public: 469 CollDataCache(UErrorCode &status); 470 ~CollDataCache(); 471 472 CollData *get(UCollator *collator, UErrorCode &status); 473 void unref(CollData *collData); 474 475 void flush(); 476 477 private: 478 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); 479 static void deleteKey(char *key); 480 481 UHashtable *cache; 482 }; 483 static UMutex lock = U_MUTEX_INITIALIZER; 484 485 U_CDECL_BEGIN 486 static void U_CALLCONV 487 deleteCollDataCacheEntry(void *obj) 488 { 489 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; 490 491 delete entry; 492 } 493 U_CDECL_END 494 495 CollDataCache::CollDataCache(UErrorCode &status) 496 : cache(NULL) 497 { 498 if (U_FAILURE(status)) { 499 return; 500 } 501 502 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status); 503 504 if (U_FAILURE(status)) { 505 return; 506 } 507 508 uhash_setValueDeleter(cache, deleteCollDataCacheEntry); 509 uhash_setKeyDeleter(cache, deleteChars); 510 } 511 512 CollDataCache::~CollDataCache() 513 { 514 umtx_lock(&lock); 515 uhash_close(cache); 516 cache = NULL; 517 umtx_unlock(&lock); 518 } 519 520 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) 521 { 522 char keyBuffer[KEY_BUFFER_SIZE]; 523 int32_t keyLength = KEY_BUFFER_SIZE; 524 char *key = getKey(collator, keyBuffer, &keyLength); 525 CollData *result = NULL, *newData = NULL; 526 CollDataCacheEntry *entry = NULL, *newEntry = NULL; 527 528 umtx_lock(&lock); 529 entry = (CollDataCacheEntry *) uhash_get(cache, key); 530 531 if (entry == NULL) { 532 umtx_unlock(&lock); 533 534 newData = new CollData(collator, key, keyLength, status); 535 newEntry = new CollDataCacheEntry(newData); 536 537 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { 538 status = U_MEMORY_ALLOCATION_ERROR; 539 return NULL; 540 } 541 542 umtx_lock(&lock); 543 entry = (CollDataCacheEntry *) uhash_get(cache, key); 544 545 if (entry == NULL) { 546 uhash_put(cache, newData->key, newEntry, &status); 547 umtx_unlock(&lock); 548 549 if (U_FAILURE(status)) { 550 delete newEntry; 551 delete newData; 552 553 return NULL; 554 } 555 556 return newData; 557 } 558 } 559 560 result = entry->data; 561 entry->refCount += 1; 562 umtx_unlock(&lock); 563 564 if (key != keyBuffer) { 565 deleteKey(key); 566 } 567 568 if (newEntry != NULL) { 569 delete newEntry; 570 delete newData; 571 } 572 573 return result; 574 } 575 576 void CollDataCache::unref(CollData *collData) 577 { 578 CollDataCacheEntry *entry = NULL; 579 580 umtx_lock(&lock); 581 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); 582 583 if (entry != NULL) { 584 entry->refCount -= 1; 585 } 586 umtx_unlock(&lock); 587 } 588 589 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength) 590 { 591 UErrorCode status = U_ZERO_ERROR; 592 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 593 594 if (len >= *keyBufferLength) { 595 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null 596 keyBuffer = NEW_ARRAY(char, *keyBufferLength); 597 status = U_ZERO_ERROR; 598 599 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 600 } 601 602 keyBuffer[len] = '\0'; 603 604 return keyBuffer; 605 } 606 607 void CollDataCache::flush() 608 { 609 const UHashElement *element; 610 int32_t pos = -1; 611 612 umtx_lock(&lock); 613 while ((element = uhash_nextElement(cache, &pos)) != NULL) { 614 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer; 615 616 if (entry->refCount <= 0) { 617 uhash_removeElement(cache, element); 618 } 619 } 620 umtx_unlock(&lock); 621 } 622 623 void CollDataCache::deleteKey(char *key) 624 { 625 DELETE_ARRAY(key); 626 } 627 628 U_CDECL_BEGIN 629 static UBool coll_data_cleanup(void) { 630 CollData::freeCollDataCache(); 631 return TRUE; 632 } 633 U_CDECL_END 634 635 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) 636 637 CollData::CollData() 638 { 639 // nothing 640 } 641 642 #define CLONE_COLLATOR 643 644 //#define CACHE_CELISTS 645 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status) 646 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) 647 { 648 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 649 // i.e. other, control, private use, format, surrogate 650 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 651 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 652 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 653 654 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 655 // i.e. all the characers we handle implicitly 656 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 657 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 658 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 659 660 if (U_FAILURE(status)) { 661 return; 662 } 663 664 USet *expansions = uset_openEmpty(); 665 USet *contractions = uset_openEmpty(); 666 int32_t itemCount; 667 668 #ifdef CACHE_CELISTS 669 charsToCEList = new StringToCEsMap(status); 670 671 if (U_FAILURE(status)) { 672 goto bail; 673 } 674 #else 675 charsToCEList = NULL; 676 #endif 677 678 ceToCharsStartingWith = new CEToStringsMap(status); 679 680 if (U_FAILURE(status)) { 681 goto bail; 682 } 683 684 if (cacheKeyLength > KEY_BUFFER_SIZE) { 685 key = NEW_ARRAY(char, cacheKeyLength); 686 687 if (key == NULL) { 688 status = U_MEMORY_ALLOCATION_ERROR; 689 goto bail; 690 } 691 } else { 692 key = keyBuffer; 693 } 694 695 ARRAY_COPY(key, cacheKey, cacheKeyLength); 696 697 #ifdef CLONE_COLLATOR 698 coll = ucol_safeClone(collator, NULL, NULL, &status); 699 700 if (U_FAILURE(status)) { 701 goto bail; 702 } 703 #else 704 coll = collator; 705 #endif 706 707 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 708 709 uset_addAll(charsToTest, contractions); 710 uset_addAll(charsToTest, expansions); 711 uset_removeAll(charsToTest, charsToRemove); 712 713 itemCount = uset_getItemCount(charsToTest); 714 for(int32_t item = 0; item < itemCount; item += 1) { 715 UChar32 start = 0, end = 0; 716 UChar buffer[16]; 717 int32_t len = uset_getItem(charsToTest, item, &start, &end, 718 buffer, 16, &status); 719 720 if (len == 0) { 721 for (UChar32 ch = start; ch <= end; ch += 1) { 722 UnicodeString *st = new UnicodeString(ch); 723 724 if (st == NULL) { 725 status = U_MEMORY_ALLOCATION_ERROR; 726 break; 727 } 728 729 CEList *ceList = new CEList(coll, *st, status); 730 731 ceToCharsStartingWith->put(ceList->get(0), st, status); 732 733 #ifdef CACHE_CELISTS 734 charsToCEList->put(st, ceList, status); 735 #else 736 delete ceList; 737 delete st; 738 #endif 739 } 740 } else if (len > 0) { 741 UnicodeString *st = new UnicodeString(buffer, len); 742 743 if (st == NULL) { 744 status = U_MEMORY_ALLOCATION_ERROR; 745 break; 746 } 747 748 CEList *ceList = new CEList(coll, *st, status); 749 750 ceToCharsStartingWith->put(ceList->get(0), st, status); 751 752 #ifdef CACHE_CELISTS 753 charsToCEList->put(st, ceList, status); 754 #else 755 delete ceList; 756 delete st; 757 #endif 758 } else { 759 // shouldn't happen... 760 } 761 762 if (U_FAILURE(status)) { 763 break; 764 } 765 } 766 767 bail: 768 uset_close(contractions); 769 uset_close(expansions); 770 uset_close(charsToRemove); 771 uset_close(charsToTest); 772 773 if (U_FAILURE(status)) { 774 return; 775 } 776 777 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, 778 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; 779 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; 780 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); 781 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 782 CEList hanList(coll, hanString, status); 783 CEList jamoList(coll, jamoString, status); 784 int32_t j = 0; 785 786 if (U_FAILURE(status)) { 787 return; 788 } 789 790 for (int32_t c = 0; c < jamoList.size(); c += 1) { 791 uint32_t jce = jamoList[c]; 792 793 if (! isContinuation(jce)) { 794 jamoLimits[j++] = jce; 795 } 796 } 797 798 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 799 800 minHan = 0xFFFFFFFF; 801 maxHan = 0; 802 803 for(int32_t h = 0; h < hanList.size(); h += 2) { 804 uint32_t han = (uint32_t) hanList[h]; 805 806 if (han < minHan) { 807 minHan = han; 808 } 809 810 if (han > maxHan) { 811 maxHan = han; 812 } 813 } 814 815 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 816 } 817 818 CollData::~CollData() 819 { 820 #ifdef CLONE_COLLATOR 821 ucol_close(coll); 822 #endif 823 824 if (key != keyBuffer) { 825 DELETE_ARRAY(key); 826 } 827 828 delete ceToCharsStartingWith; 829 830 #ifdef CACHE_CELISTS 831 delete charsToCEList; 832 #endif 833 } 834 835 UCollator *CollData::getCollator() const 836 { 837 return coll; 838 } 839 840 const StringList *CollData::getStringList(int32_t ce) const 841 { 842 return ceToCharsStartingWith->getStringList(ce); 843 } 844 845 const CEList *CollData::getCEList(const UnicodeString *string) const 846 { 847 #ifdef CACHE_CELISTS 848 return charsToCEList->get(string); 849 #else 850 UErrorCode status = U_ZERO_ERROR; 851 const CEList *list = new CEList(coll, *string, status); 852 853 if (U_FAILURE(status)) { 854 delete list; 855 list = NULL; 856 } 857 858 return list; 859 #endif 860 } 861 862 void CollData::freeCEList(const CEList *list) 863 { 864 #ifndef CACHE_CELISTS 865 delete list; 866 #endif 867 } 868 869 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 870 { 871 // find out shortest string for the longest sequence of ces. 872 // this can probably be folded with the minLengthCache... 873 874 if (history[offset] >= 0) { 875 return history[offset]; 876 } 877 878 uint32_t ce = ceList->get(offset); 879 int32_t maxOffset = ceList->size(); 880 int32_t shortestLength = INT32_MAX; 881 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 882 883 if (strings != NULL) { 884 int32_t stringCount = strings->size(); 885 886 for (int32_t s = 0; s < stringCount; s += 1) { 887 const UnicodeString *string = strings->get(s); 888 #ifdef CACHE_CELISTS 889 const CEList *ceList2 = charsToCEList->get(string); 890 #else 891 UErrorCode status = U_ZERO_ERROR; 892 const CEList *ceList2 = new CEList(coll, *string, status); 893 894 if (U_FAILURE(status)) { 895 delete ceList2; 896 ceList2 = NULL; 897 } 898 #endif 899 900 if (ceList->matchesAt(offset, ceList2)) { 901 U_ASSERT(ceList2 != NULL); 902 int32_t clength = ceList2->size(); 903 int32_t slength = string->length(); 904 int32_t roffset = offset + clength; 905 int32_t rlength = 0; 906 907 if (roffset < maxOffset) { 908 rlength = minLengthInChars(ceList, roffset, history); 909 910 if (rlength <= 0) { 911 // delete before continue to avoid memory leak. 912 #ifndef CACHE_CELISTS 913 delete ceList2; 914 #endif 915 // ignore any dead ends 916 continue; 917 } 918 } 919 920 if (shortestLength > slength + rlength) { 921 shortestLength = slength + rlength; 922 } 923 } 924 925 #ifndef CACHE_CELISTS 926 delete ceList2; 927 #endif 928 } 929 } 930 931 if (shortestLength == INT32_MAX) { 932 // No matching strings at this offset. See if 933 // the CE is in a range we can handle manually. 934 if (ce >= minHan && ce < maxHan) { 935 // all han have implicit orders which 936 // generate two CEs. 937 int32_t roffset = offset + 2; 938 int32_t rlength = 0; 939 940 //history[roffset++] = -1; 941 //history[roffset++] = 1; 942 943 if (roffset < maxOffset) { 944 rlength = minLengthInChars(ceList, roffset, history); 945 } 946 947 if (rlength < 0) { 948 return -1; 949 } 950 951 shortestLength = 1 + rlength; 952 goto have_shortest; 953 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 954 int32_t roffset = offset; 955 int32_t rlength = 0; 956 957 // **** this loop may not handle archaic Hangul correctly **** 958 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 959 uint32_t jce = ceList->get(roffset); 960 961 // Some Jamo have 24-bit primary order; skip the 962 // 2nd CE. This should always be OK because if 963 // we're still in the loop all we've seen are 964 // a series of Jamo in LVT order. 965 if (isContinuation(jce)) { 966 continue; 967 } 968 969 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 970 break; 971 } 972 } 973 974 if (roffset == offset) { 975 // we started with a non-L Jamo... 976 // just say it comes from a single character 977 roffset += 1; 978 979 // See if the single Jamo has a 24-bit order. 980 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 981 roffset += 1; 982 } 983 } 984 985 if (roffset < maxOffset) { 986 rlength = minLengthInChars(ceList, roffset, history); 987 } 988 989 if (rlength < 0) { 990 return -1; 991 } 992 993 shortestLength = 1 + rlength; 994 goto have_shortest; 995 } 996 997 // Can't handle it manually either. Just move on. 998 return -1; 999 } 1000 1001 have_shortest: 1002 history[offset] = shortestLength; 1003 1004 return shortestLength; 1005 } 1006 1007 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 1008 { 1009 int32_t clength = ceList->size(); 1010 int32_t *history = NEW_ARRAY(int32_t, clength); 1011 1012 for (int32_t i = 0; i < clength; i += 1) { 1013 history[i] = -1; 1014 } 1015 1016 int32_t minLength = minLengthInChars(ceList, offset, history); 1017 1018 DELETE_ARRAY(history); 1019 1020 return minLength; 1021 } 1022 1023 CollData *CollData::open(UCollator *collator, UErrorCode &status) 1024 { 1025 if (U_FAILURE(status)) { 1026 return NULL; 1027 } 1028 1029 CollDataCache *cache = getCollDataCache(); 1030 1031 return cache->get(collator, status); 1032 } 1033 1034 void CollData::close(CollData *collData) 1035 { 1036 CollDataCache *cache = getCollDataCache(); 1037 1038 cache->unref(collData); 1039 } 1040 1041 CollDataCache *CollData::collDataCache = NULL; 1042 1043 CollDataCache *CollData::getCollDataCache() 1044 { 1045 UErrorCode status = U_ZERO_ERROR; 1046 CollDataCache *cache = NULL; 1047 1048 UMTX_CHECK(NULL, collDataCache, cache); 1049 1050 if (cache == NULL) { 1051 cache = new CollDataCache(status); 1052 1053 if (U_FAILURE(status)) { 1054 delete cache; 1055 return NULL; 1056 } 1057 1058 umtx_lock(NULL); 1059 if (collDataCache == NULL) { 1060 collDataCache = cache; 1061 1062 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); 1063 } 1064 umtx_unlock(NULL); 1065 1066 if (collDataCache != cache) { 1067 delete cache; 1068 } 1069 } 1070 1071 return collDataCache; 1072 } 1073 1074 void CollData::freeCollDataCache() 1075 { 1076 CollDataCache *cache = NULL; 1077 1078 UMTX_CHECK(NULL, collDataCache, cache); 1079 1080 if (cache != NULL) { 1081 umtx_lock(NULL); 1082 if (collDataCache != NULL) { 1083 collDataCache = NULL; 1084 } else { 1085 cache = NULL; 1086 } 1087 umtx_unlock(NULL); 1088 1089 delete cache; 1090 } 1091 } 1092 1093 void CollData::flushCollDataCache() 1094 { 1095 CollDataCache *cache = NULL; 1096 1097 UMTX_CHECK(NULL, collDataCache, cache); 1098 1099 // **** this will fail if the another **** 1100 // **** thread deletes the cache here **** 1101 if (cache != NULL) { 1102 cache->flush(); 1103 } 1104 } 1105 1106 U_NAMESPACE_END 1107 1108 #endif // #if !UCONFIG_NO_COLLATION 1109