1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2011, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/putil.h" 14 #include "unicode/usearch.h" 15 16 #include "cmemory.h" 17 #include "unicode/coll.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/coleitr.h" 20 #include "unicode/ucoleitr.h" 21 22 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 23 24 #include "unicode/uniset.h" 25 #include "unicode/uset.h" 26 #include "unicode/ustring.h" 27 #include "hash.h" 28 #include "uhash.h" 29 #include "ucln_in.h" 30 #include "ucol_imp.h" 31 #include "umutex.h" 32 33 #include "unicode/colldata.h" 34 35 U_NAMESPACE_BEGIN 36 37 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 38 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 39 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 40 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 41 42 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CEList) 43 44 #ifdef INSTRUMENT_CELIST 45 int32_t CEList::_active = 0; 46 int32_t CEList::_histogram[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; 47 #endif 48 49 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 50 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 51 { 52 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 53 UCollationStrength strength = ucol_getStrength(coll); 54 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 55 uint32_t variableTop = ucol_getVariableTop(coll, &status); 56 uint32_t strengthMask = 0; 57 int32_t order; 58 59 if (U_FAILURE(status)) { 60 return; 61 } 62 63 // **** only set flag if string has Han(gul) **** 64 ucol_forceHanImplicit(elems, &status); 65 66 switch (strength) 67 { 68 default: 69 strengthMask |= UCOL_TERTIARYORDERMASK; 70 /* fall through */ 71 72 case UCOL_SECONDARY: 73 strengthMask |= UCOL_SECONDARYORDERMASK; 74 /* fall through */ 75 76 case UCOL_PRIMARY: 77 strengthMask |= UCOL_PRIMARYORDERMASK; 78 } 79 80 #ifdef INSTRUMENT_CELIST 81 _active += 1; 82 _histogram[0] += 1; 83 #endif 84 85 ces = ceBuffer; 86 87 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 88 UBool cont = isContinuation(order); 89 90 order &= strengthMask; 91 92 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 93 if (strength >= UCOL_QUATERNARY) { 94 order &= UCOL_PRIMARYORDERMASK; 95 } else { 96 order = UCOL_IGNORABLE; 97 } 98 } 99 100 if (order == UCOL_IGNORABLE) { 101 continue; 102 } 103 104 if (cont) { 105 order |= UCOL_CONTINUATION_MARKER; 106 } 107 108 add(order, status); 109 } 110 111 ucol_closeElements(elems); 112 } 113 114 CEList::~CEList() 115 { 116 #ifdef INSTRUMENT_CELIST 117 _active -= 1; 118 #endif 119 120 if (ces != ceBuffer) { 121 DELETE_ARRAY(ces); 122 } 123 } 124 125 void CEList::add(uint32_t ce, UErrorCode &status) 126 { 127 if (U_FAILURE(status)) { 128 return; 129 } 130 131 if (listSize >= listMax) { 132 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 133 134 #ifdef INSTRUMENT_CELIST 135 _histogram[listSize / CELIST_BUFFER_SIZE] += 1; 136 #endif 137 138 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 139 140 if (newCEs == NULL) { 141 status = U_MEMORY_ALLOCATION_ERROR; 142 return; 143 } 144 145 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 146 147 if (ces != ceBuffer) { 148 DELETE_ARRAY(ces); 149 } 150 151 ces = newCEs; 152 listMax = newMax; 153 } 154 155 ces[listSize++] = ce; 156 } 157 158 uint32_t CEList::get(int32_t index) const 159 { 160 if (index >= 0 && index < listSize) { 161 return ces[index]; 162 } 163 164 return UCOL_NULLORDER; 165 } 166 167 uint32_t &CEList::operator[](int32_t index) const 168 { 169 return ces[index]; 170 } 171 172 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 173 { 174 if (other == NULL || listSize - offset < other->size()) { 175 return FALSE; 176 } 177 178 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 179 if (ces[i] != (*other)[j]) { 180 return FALSE; 181 } 182 } 183 184 return TRUE; 185 } 186 187 int32_t CEList::size() const 188 { 189 return listSize; 190 } 191 192 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(StringList) 193 194 #ifdef INSTRUMENT_STRING_LIST 195 int32_t StringList::_lists = 0; 196 int32_t StringList::_strings = 0; 197 int32_t StringList::_histogram[101] = {0}; 198 #endif 199 200 StringList::StringList(UErrorCode &status) 201 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 202 { 203 if (U_FAILURE(status)) { 204 return; 205 } 206 207 strings = new UnicodeString [listMax]; 208 209 if (strings == NULL) { 210 status = U_MEMORY_ALLOCATION_ERROR; 211 return; 212 } 213 214 #ifdef INSTRUMENT_STRING_LIST 215 _lists += 1; 216 _histogram[0] += 1; 217 #endif 218 } 219 220 StringList::~StringList() 221 { 222 delete[] strings; 223 } 224 225 void StringList::add(const UnicodeString *string, UErrorCode &status) 226 { 227 if (U_FAILURE(status)) { 228 return; 229 } 230 231 #ifdef INSTRUMENT_STRING_LIST 232 _strings += 1; 233 #endif 234 235 if (listSize >= listMax) { 236 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 237 UnicodeString *newStrings = new UnicodeString[newMax]; 238 if (newStrings == NULL) { 239 status = U_MEMORY_ALLOCATION_ERROR; 240 return; 241 } 242 for (int32_t i=0; i<listSize; ++i) { 243 newStrings[i] = strings[i]; 244 } 245 246 #ifdef INSTRUMENT_STRING_LIST 247 int32_t _h = listSize / STRING_LIST_BUFFER_SIZE; 248 249 if (_h > 100) { 250 _h = 100; 251 } 252 253 _histogram[_h] += 1; 254 #endif 255 256 delete[] strings; 257 strings = newStrings; 258 listMax = newMax; 259 } 260 261 // The ctor initialized all the strings in 262 // the array to empty strings, so this 263 // is the same as copying the source string. 264 strings[listSize++].append(*string); 265 } 266 267 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 268 { 269 const UnicodeString string(chars, count); 270 271 add(&string, status); 272 } 273 274 const UnicodeString *StringList::get(int32_t index) const 275 { 276 if (index >= 0 && index < listSize) { 277 return &strings[index]; 278 } 279 280 return NULL; 281 } 282 283 int32_t StringList::size() const 284 { 285 return listSize; 286 } 287 288 289 U_CFUNC void deleteStringList(void *obj); 290 291 class CEToStringsMap : public UMemory 292 { 293 public: 294 295 CEToStringsMap(UErrorCode &status); 296 ~CEToStringsMap(); 297 298 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 299 StringList *getStringList(uint32_t ce) const; 300 301 private: 302 303 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 304 UHashtable *map; 305 }; 306 307 CEToStringsMap::CEToStringsMap(UErrorCode &status) 308 : map(NULL) 309 { 310 if (U_FAILURE(status)) { 311 return; 312 } 313 314 map = uhash_open(uhash_hashLong, uhash_compareLong, 315 uhash_compareCaselessUnicodeString, 316 &status); 317 318 if (U_FAILURE(status)) { 319 return; 320 } 321 322 uhash_setValueDeleter(map, deleteStringList); 323 } 324 325 CEToStringsMap::~CEToStringsMap() 326 { 327 uhash_close(map); 328 } 329 330 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 331 { 332 StringList *strings = getStringList(ce); 333 334 if (strings == NULL) { 335 strings = new StringList(status); 336 337 if (strings == NULL || U_FAILURE(status)) { 338 status = U_MEMORY_ALLOCATION_ERROR; 339 return; 340 } 341 342 putStringList(ce, strings, status); 343 } 344 345 strings->add(string, status); 346 } 347 348 StringList *CEToStringsMap::getStringList(uint32_t ce) const 349 { 350 return (StringList *) uhash_iget(map, ce); 351 } 352 353 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 354 { 355 uhash_iput(map, ce, (void *) stringList, &status); 356 } 357 358 U_CFUNC void deleteStringList(void *obj) 359 { 360 StringList *strings = (StringList *) obj; 361 362 delete strings; 363 } 364 365 U_CFUNC void deleteCEList(void *obj); 366 U_CFUNC void deleteUnicodeStringKey(void *obj); 367 368 class StringToCEsMap : public UMemory 369 { 370 public: 371 StringToCEsMap(UErrorCode &status); 372 ~StringToCEsMap(); 373 374 void put(const UnicodeString *string, const CEList *ces, UErrorCode &status); 375 const CEList *get(const UnicodeString *string); 376 void free(const CEList *list); 377 378 private: 379 380 381 UHashtable *map; 382 }; 383 384 StringToCEsMap::StringToCEsMap(UErrorCode &status) 385 : map(NULL) 386 { 387 if (U_FAILURE(status)) { 388 return; 389 } 390 391 map = uhash_open(uhash_hashUnicodeString, 392 uhash_compareUnicodeString, 393 uhash_compareLong, 394 &status); 395 396 if (U_FAILURE(status)) { 397 return; 398 } 399 400 uhash_setValueDeleter(map, deleteCEList); 401 uhash_setKeyDeleter(map, deleteUnicodeStringKey); 402 } 403 404 StringToCEsMap::~StringToCEsMap() 405 { 406 uhash_close(map); 407 } 408 409 void StringToCEsMap::put(const UnicodeString *string, const CEList *ces, UErrorCode &status) 410 { 411 uhash_put(map, (void *) string, (void *) ces, &status); 412 } 413 414 const CEList *StringToCEsMap::get(const UnicodeString *string) 415 { 416 return (const CEList *) uhash_get(map, string); 417 } 418 419 U_CFUNC void deleteCEList(void *obj) 420 { 421 CEList *list = (CEList *) obj; 422 423 delete list; 424 } 425 426 U_CFUNC void deleteUnicodeStringKey(void *obj) 427 { 428 UnicodeString *key = (UnicodeString *) obj; 429 430 delete key; 431 } 432 433 class CollDataCacheEntry : public UMemory 434 { 435 public: 436 CollDataCacheEntry(CollData *theData); 437 ~CollDataCacheEntry(); 438 439 CollData *data; 440 int32_t refCount; 441 }; 442 443 CollDataCacheEntry::CollDataCacheEntry(CollData *theData) 444 : data(theData), refCount(1) 445 { 446 // nothing else to do 447 } 448 449 CollDataCacheEntry::~CollDataCacheEntry() 450 { 451 // check refCount? 452 delete data; 453 } 454 455 class CollDataCache : public UMemory 456 { 457 public: 458 CollDataCache(UErrorCode &status); 459 ~CollDataCache(); 460 461 CollData *get(UCollator *collator, UErrorCode &status); 462 void unref(CollData *collData); 463 464 void flush(); 465 466 private: 467 static char *getKey(UCollator *collator, char *keyBuffer, int32_t *charBufferLength); 468 static void deleteKey(char *key); 469 470 UHashtable *cache; 471 }; 472 static UMTX lock; 473 474 U_CFUNC void deleteChars(void * /*obj*/) 475 { 476 // char *chars = (char *) obj; 477 // All the key strings are owned by the 478 // CollData objects and don't need to 479 // be freed here. 480 //DELETE_ARRAY(chars); 481 } 482 483 U_CFUNC void deleteCollDataCacheEntry(void *obj) 484 { 485 CollDataCacheEntry *entry = (CollDataCacheEntry *) obj; 486 487 delete entry; 488 } 489 490 CollDataCache::CollDataCache(UErrorCode &status) 491 : cache(NULL) 492 { 493 if (U_FAILURE(status)) { 494 return; 495 } 496 497 cache = uhash_open(uhash_hashChars, uhash_compareChars, uhash_compareLong, &status); 498 499 if (U_FAILURE(status)) { 500 return; 501 } 502 503 uhash_setValueDeleter(cache, deleteCollDataCacheEntry); 504 uhash_setKeyDeleter(cache, deleteChars); 505 } 506 507 CollDataCache::~CollDataCache() 508 { 509 umtx_lock(&lock); 510 uhash_close(cache); 511 cache = NULL; 512 umtx_unlock(&lock); 513 } 514 515 CollData *CollDataCache::get(UCollator *collator, UErrorCode &status) 516 { 517 char keyBuffer[KEY_BUFFER_SIZE]; 518 int32_t keyLength = KEY_BUFFER_SIZE; 519 char *key = getKey(collator, keyBuffer, &keyLength); 520 CollData *result = NULL, *newData = NULL; 521 CollDataCacheEntry *entry = NULL, *newEntry = NULL; 522 523 umtx_lock(&lock); 524 entry = (CollDataCacheEntry *) uhash_get(cache, key); 525 526 if (entry == NULL) { 527 umtx_unlock(&lock); 528 529 newData = new CollData(collator, key, keyLength, status); 530 newEntry = new CollDataCacheEntry(newData); 531 532 if (U_FAILURE(status) || newData == NULL || newEntry == NULL) { 533 status = U_MEMORY_ALLOCATION_ERROR; 534 return NULL; 535 } 536 537 umtx_lock(&lock); 538 entry = (CollDataCacheEntry *) uhash_get(cache, key); 539 540 if (entry == NULL) { 541 uhash_put(cache, newData->key, newEntry, &status); 542 umtx_unlock(&lock); 543 544 if (U_FAILURE(status)) { 545 delete newEntry; 546 delete newData; 547 548 return NULL; 549 } 550 551 return newData; 552 } 553 } 554 555 result = entry->data; 556 entry->refCount += 1; 557 umtx_unlock(&lock); 558 559 if (key != keyBuffer) { 560 deleteKey(key); 561 } 562 563 if (newEntry != NULL) { 564 delete newEntry; 565 delete newData; 566 } 567 568 return result; 569 } 570 571 void CollDataCache::unref(CollData *collData) 572 { 573 CollDataCacheEntry *entry = NULL; 574 575 umtx_lock(&lock); 576 entry = (CollDataCacheEntry *) uhash_get(cache, collData->key); 577 578 if (entry != NULL) { 579 entry->refCount -= 1; 580 } 581 umtx_unlock(&lock); 582 } 583 584 char *CollDataCache::getKey(UCollator *collator, char *keyBuffer, int32_t *keyBufferLength) 585 { 586 UErrorCode status = U_ZERO_ERROR; 587 int32_t len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 588 589 if (len >= *keyBufferLength) { 590 *keyBufferLength = (len + 2) & ~1; // round to even length, leaving room for terminating null 591 keyBuffer = NEW_ARRAY(char, *keyBufferLength); 592 status = U_ZERO_ERROR; 593 594 len = ucol_getShortDefinitionString(collator, NULL, keyBuffer, *keyBufferLength, &status); 595 } 596 597 keyBuffer[len] = '\0'; 598 599 return keyBuffer; 600 } 601 602 void CollDataCache::flush() 603 { 604 const UHashElement *element; 605 int32_t pos = -1; 606 607 umtx_lock(&lock); 608 while ((element = uhash_nextElement(cache, &pos)) != NULL) { 609 CollDataCacheEntry *entry = (CollDataCacheEntry *) element->value.pointer; 610 611 if (entry->refCount <= 0) { 612 uhash_removeElement(cache, element); 613 } 614 } 615 umtx_unlock(&lock); 616 } 617 618 void CollDataCache::deleteKey(char *key) 619 { 620 DELETE_ARRAY(key); 621 } 622 623 U_CDECL_BEGIN 624 static UBool coll_data_cleanup(void) { 625 CollData::freeCollDataCache(); 626 return TRUE; 627 } 628 U_CDECL_END 629 630 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CollData) 631 632 CollData::CollData() 633 { 634 // nothing 635 } 636 637 #define CLONE_COLLATOR 638 639 //#define CACHE_CELISTS 640 CollData::CollData(UCollator *collator, char *cacheKey, int32_t cacheKeyLength, UErrorCode &status) 641 : coll(NULL), charsToCEList(NULL), ceToCharsStartingWith(NULL), key(NULL) 642 { 643 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 644 // i.e. other, control, private use, format, surrogate 645 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 646 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 647 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 648 649 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 650 // i.e. all the characers we handle implicitly 651 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 652 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 653 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 654 655 if (U_FAILURE(status)) { 656 return; 657 } 658 659 USet *expansions = uset_openEmpty(); 660 USet *contractions = uset_openEmpty(); 661 int32_t itemCount; 662 663 #ifdef CACHE_CELISTS 664 charsToCEList = new StringToCEsMap(status); 665 666 if (U_FAILURE(status)) { 667 goto bail; 668 } 669 #else 670 charsToCEList = NULL; 671 #endif 672 673 ceToCharsStartingWith = new CEToStringsMap(status); 674 675 if (U_FAILURE(status)) { 676 goto bail; 677 } 678 679 if (cacheKeyLength > KEY_BUFFER_SIZE) { 680 key = NEW_ARRAY(char, cacheKeyLength); 681 682 if (key == NULL) { 683 status = U_MEMORY_ALLOCATION_ERROR; 684 goto bail; 685 } 686 } else { 687 key = keyBuffer; 688 } 689 690 ARRAY_COPY(key, cacheKey, cacheKeyLength); 691 692 #ifdef CLONE_COLLATOR 693 coll = ucol_safeClone(collator, NULL, NULL, &status); 694 695 if (U_FAILURE(status)) { 696 goto bail; 697 } 698 #else 699 coll = collator; 700 #endif 701 702 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 703 704 uset_addAll(charsToTest, contractions); 705 uset_addAll(charsToTest, expansions); 706 uset_removeAll(charsToTest, charsToRemove); 707 708 itemCount = uset_getItemCount(charsToTest); 709 for(int32_t item = 0; item < itemCount; item += 1) { 710 UChar32 start = 0, end = 0; 711 UChar buffer[16]; 712 int32_t len = uset_getItem(charsToTest, item, &start, &end, 713 buffer, 16, &status); 714 715 if (len == 0) { 716 for (UChar32 ch = start; ch <= end; ch += 1) { 717 UnicodeString *st = new UnicodeString(ch); 718 719 if (st == NULL) { 720 status = U_MEMORY_ALLOCATION_ERROR; 721 break; 722 } 723 724 CEList *ceList = new CEList(coll, *st, status); 725 726 ceToCharsStartingWith->put(ceList->get(0), st, status); 727 728 #ifdef CACHE_CELISTS 729 charsToCEList->put(st, ceList, status); 730 #else 731 delete ceList; 732 delete st; 733 #endif 734 } 735 } else if (len > 0) { 736 UnicodeString *st = new UnicodeString(buffer, len); 737 738 if (st == NULL) { 739 status = U_MEMORY_ALLOCATION_ERROR; 740 break; 741 } 742 743 CEList *ceList = new CEList(coll, *st, status); 744 745 ceToCharsStartingWith->put(ceList->get(0), st, status); 746 747 #ifdef CACHE_CELISTS 748 charsToCEList->put(st, ceList, status); 749 #else 750 delete ceList; 751 delete st; 752 #endif 753 } else { 754 // shouldn't happen... 755 } 756 757 if (U_FAILURE(status)) { 758 break; 759 } 760 } 761 762 bail: 763 uset_close(contractions); 764 uset_close(expansions); 765 uset_close(charsToRemove); 766 uset_close(charsToTest); 767 768 if (U_FAILURE(status)) { 769 return; 770 } 771 772 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, 773 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; 774 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; 775 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); 776 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 777 CEList hanList(coll, hanString, status); 778 CEList jamoList(coll, jamoString, status); 779 int32_t j = 0; 780 781 if (U_FAILURE(status)) { 782 return; 783 } 784 785 for (int32_t c = 0; c < jamoList.size(); c += 1) { 786 uint32_t jce = jamoList[c]; 787 788 if (! isContinuation(jce)) { 789 jamoLimits[j++] = jce; 790 } 791 } 792 793 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 794 795 minHan = 0xFFFFFFFF; 796 maxHan = 0; 797 798 for(int32_t h = 0; h < hanList.size(); h += 2) { 799 uint32_t han = (uint32_t) hanList[h]; 800 801 if (han < minHan) { 802 minHan = han; 803 } 804 805 if (han > maxHan) { 806 maxHan = han; 807 } 808 } 809 810 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 811 } 812 813 CollData::~CollData() 814 { 815 #ifdef CLONE_COLLATOR 816 ucol_close(coll); 817 #endif 818 819 if (key != keyBuffer) { 820 DELETE_ARRAY(key); 821 } 822 823 delete ceToCharsStartingWith; 824 825 #ifdef CACHE_CELISTS 826 delete charsToCEList; 827 #endif 828 } 829 830 UCollator *CollData::getCollator() const 831 { 832 return coll; 833 } 834 835 const StringList *CollData::getStringList(int32_t ce) const 836 { 837 return ceToCharsStartingWith->getStringList(ce); 838 } 839 840 const CEList *CollData::getCEList(const UnicodeString *string) const 841 { 842 #ifdef CACHE_CELISTS 843 return charsToCEList->get(string); 844 #else 845 UErrorCode status = U_ZERO_ERROR; 846 const CEList *list = new CEList(coll, *string, status); 847 848 if (U_FAILURE(status)) { 849 delete list; 850 list = NULL; 851 } 852 853 return list; 854 #endif 855 } 856 857 void CollData::freeCEList(const CEList *list) 858 { 859 #ifndef CACHE_CELISTS 860 delete list; 861 #endif 862 } 863 864 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 865 { 866 // find out shortest string for the longest sequence of ces. 867 // this can probably be folded with the minLengthCache... 868 869 if (history[offset] >= 0) { 870 return history[offset]; 871 } 872 873 uint32_t ce = ceList->get(offset); 874 int32_t maxOffset = ceList->size(); 875 int32_t shortestLength = INT32_MAX; 876 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 877 878 if (strings != NULL) { 879 int32_t stringCount = strings->size(); 880 881 for (int32_t s = 0; s < stringCount; s += 1) { 882 const UnicodeString *string = strings->get(s); 883 #ifdef CACHE_CELISTS 884 const CEList *ceList2 = charsToCEList->get(string); 885 #else 886 UErrorCode status = U_ZERO_ERROR; 887 const CEList *ceList2 = new CEList(coll, *string, status); 888 889 if (U_FAILURE(status)) { 890 delete ceList2; 891 ceList2 = NULL; 892 } 893 #endif 894 895 if (ceList->matchesAt(offset, ceList2)) { 896 int32_t clength = ceList2->size(); 897 int32_t slength = string->length(); 898 int32_t roffset = offset + clength; 899 int32_t rlength = 0; 900 901 if (roffset < maxOffset) { 902 rlength = minLengthInChars(ceList, roffset, history); 903 904 if (rlength <= 0) { 905 // delete before continue to avoid memory leak. 906 #ifndef CACHE_CELISTS 907 delete ceList2; 908 #endif 909 // ignore any dead ends 910 continue; 911 } 912 } 913 914 if (shortestLength > slength + rlength) { 915 shortestLength = slength + rlength; 916 } 917 } 918 919 #ifndef CACHE_CELISTS 920 delete ceList2; 921 #endif 922 } 923 } 924 925 if (shortestLength == INT32_MAX) { 926 // No matching strings at this offset. See if 927 // the CE is in a range we can handle manually. 928 if (ce >= minHan && ce < maxHan) { 929 // all han have implicit orders which 930 // generate two CEs. 931 int32_t roffset = offset + 2; 932 int32_t rlength = 0; 933 934 //history[roffset++] = -1; 935 //history[roffset++] = 1; 936 937 if (roffset < maxOffset) { 938 rlength = minLengthInChars(ceList, roffset, history); 939 } 940 941 if (rlength < 0) { 942 return -1; 943 } 944 945 shortestLength = 1 + rlength; 946 goto have_shortest; 947 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 948 int32_t roffset = offset; 949 int32_t rlength = 0; 950 951 // **** this loop may not handle archaic Hangul correctly **** 952 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 953 uint32_t jce = ceList->get(roffset); 954 955 // Some Jamo have 24-bit primary order; skip the 956 // 2nd CE. This should always be OK because if 957 // we're still in the loop all we've seen are 958 // a series of Jamo in LVT order. 959 if (isContinuation(jce)) { 960 continue; 961 } 962 963 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 964 break; 965 } 966 } 967 968 if (roffset == offset) { 969 // we started with a non-L Jamo... 970 // just say it comes from a single character 971 roffset += 1; 972 973 // See if the single Jamo has a 24-bit order. 974 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 975 roffset += 1; 976 } 977 } 978 979 if (roffset < maxOffset) { 980 rlength = minLengthInChars(ceList, roffset, history); 981 } 982 983 if (rlength < 0) { 984 return -1; 985 } 986 987 shortestLength = 1 + rlength; 988 goto have_shortest; 989 } 990 991 // Can't handle it manually either. Just move on. 992 return -1; 993 } 994 995 have_shortest: 996 history[offset] = shortestLength; 997 998 return shortestLength; 999 } 1000 1001 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 1002 { 1003 int32_t clength = ceList->size(); 1004 int32_t *history = NEW_ARRAY(int32_t, clength); 1005 1006 for (int32_t i = 0; i < clength; i += 1) { 1007 history[i] = -1; 1008 } 1009 1010 int32_t minLength = minLengthInChars(ceList, offset, history); 1011 1012 DELETE_ARRAY(history); 1013 1014 return minLength; 1015 } 1016 1017 CollData *CollData::open(UCollator *collator, UErrorCode &status) 1018 { 1019 if (U_FAILURE(status)) { 1020 return NULL; 1021 } 1022 1023 CollDataCache *cache = getCollDataCache(); 1024 1025 return cache->get(collator, status); 1026 } 1027 1028 void CollData::close(CollData *collData) 1029 { 1030 CollDataCache *cache = getCollDataCache(); 1031 1032 cache->unref(collData); 1033 } 1034 1035 CollDataCache *CollData::collDataCache = NULL; 1036 1037 CollDataCache *CollData::getCollDataCache() 1038 { 1039 UErrorCode status = U_ZERO_ERROR; 1040 CollDataCache *cache = NULL; 1041 1042 UMTX_CHECK(NULL, collDataCache, cache); 1043 1044 if (cache == NULL) { 1045 cache = new CollDataCache(status); 1046 1047 if (U_FAILURE(status)) { 1048 delete cache; 1049 return NULL; 1050 } 1051 1052 umtx_lock(NULL); 1053 if (collDataCache == NULL) { 1054 collDataCache = cache; 1055 1056 ucln_i18n_registerCleanup(UCLN_I18N_COLL_DATA, coll_data_cleanup); 1057 } 1058 umtx_unlock(NULL); 1059 1060 if (collDataCache != cache) { 1061 delete cache; 1062 } 1063 } 1064 1065 return collDataCache; 1066 } 1067 1068 void CollData::freeCollDataCache() 1069 { 1070 CollDataCache *cache = NULL; 1071 1072 UMTX_CHECK(NULL, collDataCache, cache); 1073 1074 if (cache != NULL) { 1075 umtx_lock(NULL); 1076 if (collDataCache != NULL) { 1077 collDataCache = NULL; 1078 } else { 1079 cache = NULL; 1080 } 1081 umtx_unlock(NULL); 1082 1083 delete cache; 1084 } 1085 } 1086 1087 void CollData::flushCollDataCache() 1088 { 1089 CollDataCache *cache = NULL; 1090 1091 UMTX_CHECK(NULL, collDataCache, cache); 1092 1093 // **** this will fail if the another **** 1094 // **** thread deletes the cache here **** 1095 if (cache != NULL) { 1096 cache->flush(); 1097 } 1098 } 1099 1100 U_NAMESPACE_END 1101 1102 #endif // #if !UCONFIG_NO_COLLATION 1103