1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/usearch.h" 14 15 #include "cmemory.h" 16 #include "unicode/coll.h" 17 #include "unicode/tblcoll.h" 18 #include "unicode/coleitr.h" 19 #include "unicode/ucoleitr.h" 20 21 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 22 23 #include "unicode/uniset.h" 24 #include "unicode/uset.h" 25 #include "unicode/usetiter.h" 26 #include "unicode/ustring.h" 27 #include "hash.h" 28 #include "normalizer2impl.h" 29 #include "uhash.h" 30 #include "usrchimp.h" 31 #include "uassert.h" 32 33 #include "colldata.h" 34 35 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 36 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 37 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 38 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 39 40 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 41 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 42 { 43 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 44 UCollationStrength strength = ucol_getStrength(coll); 45 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 46 uint32_t variableTop = ucol_getVariableTop(coll, &status); 47 uint32_t strengthMask = 0; 48 int32_t order; 49 50 if (U_FAILURE(status)) { 51 return; 52 } 53 54 // **** only set flag if string has Han(gul) **** 55 // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476 56 57 switch (strength) 58 { 59 default: 60 strengthMask |= UCOL_TERTIARYORDERMASK; 61 /* fall through */ 62 63 case UCOL_SECONDARY: 64 strengthMask |= UCOL_SECONDARYORDERMASK; 65 /* fall through */ 66 67 case UCOL_PRIMARY: 68 strengthMask |= UCOL_PRIMARYORDERMASK; 69 } 70 71 ces = ceBuffer; 72 73 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 74 UBool cont = isContinuation(order); 75 76 order &= strengthMask; 77 78 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 79 if (strength >= UCOL_QUATERNARY) { 80 order &= UCOL_PRIMARYORDERMASK; 81 } else { 82 order = UCOL_IGNORABLE; 83 } 84 } 85 86 if (order == UCOL_IGNORABLE) { 87 continue; 88 } 89 90 if (cont) { 91 order |= UCOL_CONTINUATION_MARKER; 92 } 93 94 add(order, status); 95 } 96 97 ucol_closeElements(elems); 98 } 99 100 CEList::~CEList() 101 { 102 if (ces != ceBuffer) { 103 DELETE_ARRAY(ces); 104 } 105 } 106 107 void CEList::add(uint32_t ce, UErrorCode &status) 108 { 109 if (U_FAILURE(status)) { 110 return; 111 } 112 113 if (listSize >= listMax) { 114 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 115 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 116 117 if (newCEs == NULL) { 118 status = U_MEMORY_ALLOCATION_ERROR; 119 return; 120 } 121 122 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 123 124 if (ces != ceBuffer) { 125 DELETE_ARRAY(ces); 126 } 127 128 ces = newCEs; 129 listMax = newMax; 130 } 131 132 ces[listSize++] = ce; 133 } 134 135 uint32_t CEList::get(int32_t index) const 136 { 137 if (index >= 0 && index < listSize) { 138 return ces[index]; 139 } 140 141 return (uint32_t)UCOL_NULLORDER; 142 } 143 144 uint32_t &CEList::operator[](int32_t index) const 145 { 146 return ces[index]; 147 } 148 149 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 150 { 151 if (other == NULL || listSize - offset < other->size()) { 152 return FALSE; 153 } 154 155 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 156 if (ces[i] != (*other)[j]) { 157 return FALSE; 158 } 159 } 160 161 return TRUE; 162 } 163 164 int32_t CEList::size() const 165 { 166 return listSize; 167 } 168 169 StringList::StringList(UErrorCode &status) 170 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 171 { 172 if (U_FAILURE(status)) { 173 return; 174 } 175 176 strings = new UnicodeString [listMax]; 177 178 if (strings == NULL) { 179 status = U_MEMORY_ALLOCATION_ERROR; 180 return; 181 } 182 } 183 184 StringList::~StringList() 185 { 186 delete[] strings; 187 } 188 189 void StringList::add(const UnicodeString *string, UErrorCode &status) 190 { 191 if (U_FAILURE(status)) { 192 return; 193 } 194 if (listSize >= listMax) { 195 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 196 UnicodeString *newStrings = new UnicodeString[newMax]; 197 if (newStrings == NULL) { 198 status = U_MEMORY_ALLOCATION_ERROR; 199 return; 200 } 201 for (int32_t i=0; i<listSize; ++i) { 202 newStrings[i] = strings[i]; 203 } 204 delete[] strings; 205 strings = newStrings; 206 listMax = newMax; 207 } 208 209 // The ctor initialized all the strings in 210 // the array to empty strings, so this 211 // is the same as copying the source string. 212 strings[listSize++].append(*string); 213 } 214 215 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 216 { 217 const UnicodeString string(chars, count); 218 219 add(&string, status); 220 } 221 222 const UnicodeString *StringList::get(int32_t index) const 223 { 224 if (index >= 0 && index < listSize) { 225 return &strings[index]; 226 } 227 228 return NULL; 229 } 230 231 int32_t StringList::size() const 232 { 233 return listSize; 234 } 235 236 237 U_CDECL_BEGIN 238 static void U_CALLCONV 239 deleteStringList(void *obj) 240 { 241 StringList *strings = (StringList *) obj; 242 243 delete strings; 244 } 245 U_CDECL_END 246 247 class CEToStringsMap 248 { 249 public: 250 CEToStringsMap(UErrorCode &status); 251 ~CEToStringsMap(); 252 253 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 254 StringList *getStringList(uint32_t ce) const; 255 256 private: 257 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 258 UHashtable *map; 259 }; 260 261 CEToStringsMap::CEToStringsMap(UErrorCode &status) 262 : map(NULL) 263 { 264 if (U_FAILURE(status)) { 265 return; 266 } 267 268 map = uhash_open(uhash_hashLong, uhash_compareLong, 269 uhash_compareCaselessUnicodeString, 270 &status); 271 272 if (U_FAILURE(status)) { 273 return; 274 } 275 276 uhash_setValueDeleter(map, deleteStringList); 277 } 278 279 CEToStringsMap::~CEToStringsMap() 280 { 281 uhash_close(map); 282 } 283 284 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 285 { 286 StringList *strings = getStringList(ce); 287 288 if (strings == NULL) { 289 strings = new StringList(status); 290 291 if (strings == NULL || U_FAILURE(status)) { 292 status = U_MEMORY_ALLOCATION_ERROR; 293 return; 294 } 295 296 putStringList(ce, strings, status); 297 } 298 299 strings->add(string, status); 300 } 301 302 StringList *CEToStringsMap::getStringList(uint32_t ce) const 303 { 304 return (StringList *) uhash_iget(map, ce); 305 } 306 307 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 308 { 309 uhash_iput(map, ce, (void *) stringList, &status); 310 } 311 312 #define CLONE_COLLATOR 313 314 CollData::CollData(UCollator *collator, UErrorCode &status) 315 : coll(NULL), ceToCharsStartingWith(NULL) 316 { 317 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 318 // i.e. other, control, private use, format, surrogate 319 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 320 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 321 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 322 323 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 324 // i.e. all the characers we handle implicitly 325 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 326 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 327 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 328 329 if (U_FAILURE(status)) { 330 return; 331 } 332 333 USet *expansions = uset_openEmpty(); 334 USet *contractions = uset_openEmpty(); 335 int32_t itemCount; 336 337 ceToCharsStartingWith = new CEToStringsMap(status); 338 339 if (U_FAILURE(status)) { 340 goto bail; 341 } 342 343 #ifdef CLONE_COLLATOR 344 coll = ucol_safeClone(collator, NULL, NULL, &status); 345 346 if (U_FAILURE(status)) { 347 goto bail; 348 } 349 #else 350 coll = collator; 351 #endif 352 353 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 354 355 uset_addAll(charsToTest, contractions); 356 uset_addAll(charsToTest, expansions); 357 uset_removeAll(charsToTest, charsToRemove); 358 359 itemCount = uset_getItemCount(charsToTest); 360 for(int32_t item = 0; item < itemCount; item += 1) { 361 UChar32 start = 0, end = 0; 362 UChar buffer[16]; 363 int32_t len = uset_getItem(charsToTest, item, &start, &end, 364 buffer, 16, &status); 365 366 if (len == 0) { 367 for (UChar32 ch = start; ch <= end; ch += 1) { 368 UnicodeString *st = new UnicodeString(ch); 369 370 if (st == NULL) { 371 status = U_MEMORY_ALLOCATION_ERROR; 372 break; 373 } 374 375 CEList *ceList = new CEList(coll, *st, status); 376 377 ceToCharsStartingWith->put(ceList->get(0), st, status); 378 379 delete ceList; 380 delete st; 381 } 382 } else if (len > 0) { 383 UnicodeString *st = new UnicodeString(buffer, len); 384 385 if (st == NULL) { 386 status = U_MEMORY_ALLOCATION_ERROR; 387 break; 388 } 389 390 CEList *ceList = new CEList(coll, *st, status); 391 392 ceToCharsStartingWith->put(ceList->get(0), st, status); 393 394 delete ceList; 395 delete st; 396 } else { 397 // shouldn't happen... 398 } 399 400 if (U_FAILURE(status)) { 401 break; 402 } 403 } 404 405 bail: 406 uset_close(contractions); 407 uset_close(expansions); 408 uset_close(charsToRemove); 409 uset_close(charsToTest); 410 411 if (U_FAILURE(status)) { 412 return; 413 } 414 415 UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status); 416 if (U_FAILURE(status)) { 417 return; 418 } 419 UnicodeSetIterator hanIter(hanRanges); 420 UnicodeString hanString; 421 while(hanIter.nextRange()) { 422 hanString.append(hanIter.getCodepoint()); 423 hanString.append(hanIter.getCodepointEnd()); 424 } 425 // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9, 426 // but as of Unicode 6.3 the 11xx block is filled, 427 // and there are also more Jamo T at U+D7CB..U+D7FB. 428 // Maybe use [:HST=T:] and look for the end of the last range? 429 // Maybe use script boundary mappings instead of this code?? 430 UChar jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF}; 431 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 432 CEList hanList(coll, hanString, status); 433 CEList jamoList(coll, jamoString, status); 434 int32_t j = 0; 435 436 if (U_FAILURE(status)) { 437 return; 438 } 439 440 for (int32_t c = 0; c < jamoList.size(); c += 1) { 441 uint32_t jce = jamoList[c]; 442 443 if (! isContinuation(jce)) { 444 jamoLimits[j++] = jce; 445 } 446 } 447 448 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 449 450 minHan = 0xFFFFFFFF; 451 maxHan = 0; 452 453 for(int32_t h = 0; h < hanList.size(); h += 2) { 454 uint32_t han = (uint32_t) hanList[h]; 455 456 if (han < minHan) { 457 minHan = han; 458 } 459 460 if (han > maxHan) { 461 maxHan = han; 462 } 463 } 464 465 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 466 } 467 468 CollData::~CollData() 469 { 470 #ifdef CLONE_COLLATOR 471 ucol_close(coll); 472 #endif 473 474 delete ceToCharsStartingWith; 475 } 476 477 UCollator *CollData::getCollator() const 478 { 479 return coll; 480 } 481 482 const StringList *CollData::getStringList(int32_t ce) const 483 { 484 return ceToCharsStartingWith->getStringList(ce); 485 } 486 487 const CEList *CollData::getCEList(const UnicodeString *string) const 488 { 489 UErrorCode status = U_ZERO_ERROR; 490 const CEList *list = new CEList(coll, *string, status); 491 492 if (U_FAILURE(status)) { 493 delete list; 494 list = NULL; 495 } 496 497 return list; 498 } 499 500 void CollData::freeCEList(const CEList *list) 501 { 502 delete list; 503 } 504 505 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 506 { 507 // find out shortest string for the longest sequence of ces. 508 // this can probably be folded with the minLengthCache... 509 510 if (history[offset] >= 0) { 511 return history[offset]; 512 } 513 514 uint32_t ce = ceList->get(offset); 515 int32_t maxOffset = ceList->size(); 516 int32_t shortestLength = INT32_MAX; 517 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 518 519 if (strings != NULL) { 520 int32_t stringCount = strings->size(); 521 522 for (int32_t s = 0; s < stringCount; s += 1) { 523 const UnicodeString *string = strings->get(s); 524 UErrorCode status = U_ZERO_ERROR; 525 const CEList *ceList2 = new CEList(coll, *string, status); 526 527 if (U_FAILURE(status)) { 528 delete ceList2; 529 ceList2 = NULL; 530 } 531 532 if (ceList->matchesAt(offset, ceList2)) { 533 U_ASSERT(ceList2 != NULL); 534 int32_t clength = ceList2->size(); 535 int32_t slength = string->length(); 536 int32_t roffset = offset + clength; 537 int32_t rlength = 0; 538 539 if (roffset < maxOffset) { 540 rlength = minLengthInChars(ceList, roffset, history); 541 542 if (rlength <= 0) { 543 // delete before continue to avoid memory leak. 544 delete ceList2; 545 546 // ignore any dead ends 547 continue; 548 } 549 } 550 551 if (shortestLength > slength + rlength) { 552 shortestLength = slength + rlength; 553 } 554 } 555 556 delete ceList2; 557 } 558 } 559 560 if (shortestLength == INT32_MAX) { 561 // No matching strings at this offset. See if 562 // the CE is in a range we can handle manually. 563 if (ce >= minHan && ce < maxHan) { 564 // all han have implicit orders which 565 // generate two CEs. 566 int32_t roffset = offset + 2; 567 int32_t rlength = 0; 568 569 //history[roffset++] = -1; 570 //history[roffset++] = 1; 571 572 if (roffset < maxOffset) { 573 rlength = minLengthInChars(ceList, roffset, history); 574 } 575 576 if (rlength < 0) { 577 return -1; 578 } 579 580 shortestLength = 1 + rlength; 581 goto have_shortest; 582 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 583 int32_t roffset = offset; 584 int32_t rlength = 0; 585 586 // **** this loop may not handle archaic Hangul correctly **** 587 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 588 uint32_t jce = ceList->get(roffset); 589 590 // Some Jamo have 24-bit primary order; skip the 591 // 2nd CE. This should always be OK because if 592 // we're still in the loop all we've seen are 593 // a series of Jamo in LVT order. 594 if (isContinuation(jce)) { 595 continue; 596 } 597 598 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 599 break; 600 } 601 } 602 603 if (roffset == offset) { 604 // we started with a non-L Jamo... 605 // just say it comes from a single character 606 roffset += 1; 607 608 // See if the single Jamo has a 24-bit order. 609 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 610 roffset += 1; 611 } 612 } 613 614 if (roffset < maxOffset) { 615 rlength = minLengthInChars(ceList, roffset, history); 616 } 617 618 if (rlength < 0) { 619 return -1; 620 } 621 622 shortestLength = 1 + rlength; 623 goto have_shortest; 624 } 625 626 // Can't handle it manually either. Just move on. 627 return -1; 628 } 629 630 have_shortest: 631 history[offset] = shortestLength; 632 633 return shortestLength; 634 } 635 636 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 637 { 638 int32_t clength = ceList->size(); 639 int32_t *history = NEW_ARRAY(int32_t, clength); 640 641 for (int32_t i = 0; i < clength; i += 1) { 642 history[i] = -1; 643 } 644 645 int32_t minLength = minLengthInChars(ceList, offset, history); 646 647 DELETE_ARRAY(history); 648 649 return minLength; 650 } 651 652 #endif // #if !UCONFIG_NO_COLLATION 653