1 /* 2 ****************************************************************************** 3 * Copyright (C) 1996-2012, International Business Machines * 4 * Corporation and others. All Rights Reserved. * 5 ****************************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/unistr.h" 13 #include "unicode/usearch.h" 14 15 #include "cmemory.h" 16 #include "unicode/coll.h" 17 #include "unicode/tblcoll.h" 18 #include "unicode/coleitr.h" 19 #include "unicode/ucoleitr.h" 20 21 #include "unicode/regex.h" // TODO: make conditional on regexp being built. 22 23 #include "unicode/uniset.h" 24 #include "unicode/uset.h" 25 #include "unicode/ustring.h" 26 #include "hash.h" 27 #include "uhash.h" 28 #include "ucol_imp.h" 29 #include "uassert.h" 30 31 #include "colldata.h" 32 33 #define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0])) 34 #define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type)) 35 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 36 #define ARRAY_COPY(dst, src, count) uprv_memcpy((void *) (dst), (void *) (src), (count) * sizeof (src)[0]) 37 38 CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status) 39 : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0) 40 { 41 UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status); 42 UCollationStrength strength = ucol_getStrength(coll); 43 UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED; 44 uint32_t variableTop = ucol_getVariableTop(coll, &status); 45 uint32_t strengthMask = 0; 46 int32_t order; 47 48 if (U_FAILURE(status)) { 49 return; 50 } 51 52 // **** only set flag if string has Han(gul) **** 53 ucol_forceHanImplicit(elems, &status); 54 55 switch (strength) 56 { 57 default: 58 strengthMask |= UCOL_TERTIARYORDERMASK; 59 /* fall through */ 60 61 case UCOL_SECONDARY: 62 strengthMask |= UCOL_SECONDARYORDERMASK; 63 /* fall through */ 64 65 case UCOL_PRIMARY: 66 strengthMask |= UCOL_PRIMARYORDERMASK; 67 } 68 69 ces = ceBuffer; 70 71 while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) { 72 UBool cont = isContinuation(order); 73 74 order &= strengthMask; 75 76 if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) { 77 if (strength >= UCOL_QUATERNARY) { 78 order &= UCOL_PRIMARYORDERMASK; 79 } else { 80 order = UCOL_IGNORABLE; 81 } 82 } 83 84 if (order == UCOL_IGNORABLE) { 85 continue; 86 } 87 88 if (cont) { 89 order |= UCOL_CONTINUATION_MARKER; 90 } 91 92 add(order, status); 93 } 94 95 ucol_closeElements(elems); 96 } 97 98 CEList::~CEList() 99 { 100 if (ces != ceBuffer) { 101 DELETE_ARRAY(ces); 102 } 103 } 104 105 void CEList::add(uint32_t ce, UErrorCode &status) 106 { 107 if (U_FAILURE(status)) { 108 return; 109 } 110 111 if (listSize >= listMax) { 112 int32_t newMax = listMax + CELIST_BUFFER_SIZE; 113 uint32_t *newCEs = NEW_ARRAY(uint32_t, newMax); 114 115 if (newCEs == NULL) { 116 status = U_MEMORY_ALLOCATION_ERROR; 117 return; 118 } 119 120 uprv_memcpy(newCEs, ces, listSize * sizeof(uint32_t)); 121 122 if (ces != ceBuffer) { 123 DELETE_ARRAY(ces); 124 } 125 126 ces = newCEs; 127 listMax = newMax; 128 } 129 130 ces[listSize++] = ce; 131 } 132 133 uint32_t CEList::get(int32_t index) const 134 { 135 if (index >= 0 && index < listSize) { 136 return ces[index]; 137 } 138 139 return (uint32_t)UCOL_NULLORDER; 140 } 141 142 uint32_t &CEList::operator[](int32_t index) const 143 { 144 return ces[index]; 145 } 146 147 UBool CEList::matchesAt(int32_t offset, const CEList *other) const 148 { 149 if (other == NULL || listSize - offset < other->size()) { 150 return FALSE; 151 } 152 153 for (int32_t i = offset, j = 0; j < other->size(); i += 1, j += 1) { 154 if (ces[i] != (*other)[j]) { 155 return FALSE; 156 } 157 } 158 159 return TRUE; 160 } 161 162 int32_t CEList::size() const 163 { 164 return listSize; 165 } 166 167 StringList::StringList(UErrorCode &status) 168 : strings(NULL), listMax(STRING_LIST_BUFFER_SIZE), listSize(0) 169 { 170 if (U_FAILURE(status)) { 171 return; 172 } 173 174 strings = new UnicodeString [listMax]; 175 176 if (strings == NULL) { 177 status = U_MEMORY_ALLOCATION_ERROR; 178 return; 179 } 180 } 181 182 StringList::~StringList() 183 { 184 delete[] strings; 185 } 186 187 void StringList::add(const UnicodeString *string, UErrorCode &status) 188 { 189 if (U_FAILURE(status)) { 190 return; 191 } 192 if (listSize >= listMax) { 193 int32_t newMax = listMax + STRING_LIST_BUFFER_SIZE; 194 UnicodeString *newStrings = new UnicodeString[newMax]; 195 if (newStrings == NULL) { 196 status = U_MEMORY_ALLOCATION_ERROR; 197 return; 198 } 199 for (int32_t i=0; i<listSize; ++i) { 200 newStrings[i] = strings[i]; 201 } 202 delete[] strings; 203 strings = newStrings; 204 listMax = newMax; 205 } 206 207 // The ctor initialized all the strings in 208 // the array to empty strings, so this 209 // is the same as copying the source string. 210 strings[listSize++].append(*string); 211 } 212 213 void StringList::add(const UChar *chars, int32_t count, UErrorCode &status) 214 { 215 const UnicodeString string(chars, count); 216 217 add(&string, status); 218 } 219 220 const UnicodeString *StringList::get(int32_t index) const 221 { 222 if (index >= 0 && index < listSize) { 223 return &strings[index]; 224 } 225 226 return NULL; 227 } 228 229 int32_t StringList::size() const 230 { 231 return listSize; 232 } 233 234 235 U_CDECL_BEGIN 236 static void U_CALLCONV 237 deleteStringList(void *obj) 238 { 239 StringList *strings = (StringList *) obj; 240 241 delete strings; 242 } 243 U_CDECL_END 244 245 class CEToStringsMap 246 { 247 public: 248 CEToStringsMap(UErrorCode &status); 249 ~CEToStringsMap(); 250 251 void put(uint32_t ce, UnicodeString *string, UErrorCode &status); 252 StringList *getStringList(uint32_t ce) const; 253 254 private: 255 void putStringList(uint32_t ce, StringList *stringList, UErrorCode &status); 256 UHashtable *map; 257 }; 258 259 CEToStringsMap::CEToStringsMap(UErrorCode &status) 260 : map(NULL) 261 { 262 if (U_FAILURE(status)) { 263 return; 264 } 265 266 map = uhash_open(uhash_hashLong, uhash_compareLong, 267 uhash_compareCaselessUnicodeString, 268 &status); 269 270 if (U_FAILURE(status)) { 271 return; 272 } 273 274 uhash_setValueDeleter(map, deleteStringList); 275 } 276 277 CEToStringsMap::~CEToStringsMap() 278 { 279 uhash_close(map); 280 } 281 282 void CEToStringsMap::put(uint32_t ce, UnicodeString *string, UErrorCode &status) 283 { 284 StringList *strings = getStringList(ce); 285 286 if (strings == NULL) { 287 strings = new StringList(status); 288 289 if (strings == NULL || U_FAILURE(status)) { 290 status = U_MEMORY_ALLOCATION_ERROR; 291 return; 292 } 293 294 putStringList(ce, strings, status); 295 } 296 297 strings->add(string, status); 298 } 299 300 StringList *CEToStringsMap::getStringList(uint32_t ce) const 301 { 302 return (StringList *) uhash_iget(map, ce); 303 } 304 305 void CEToStringsMap::putStringList(uint32_t ce, StringList *stringList, UErrorCode &status) 306 { 307 uhash_iput(map, ce, (void *) stringList, &status); 308 } 309 310 #define CLONE_COLLATOR 311 312 CollData::CollData(UCollator *collator, UErrorCode &status) 313 : coll(NULL), ceToCharsStartingWith(NULL) 314 { 315 // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]] 316 // i.e. other, control, private use, format, surrogate 317 U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20); 318 U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20); 319 USet *charsToTest = uset_openPattern(test_pattern, 20, &status); 320 321 // Han ext. A, Han, Jamo, Hangul, Han Ext. B 322 // i.e. all the characers we handle implicitly 323 U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 324 U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70); 325 USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status); 326 327 if (U_FAILURE(status)) { 328 return; 329 } 330 331 USet *expansions = uset_openEmpty(); 332 USet *contractions = uset_openEmpty(); 333 int32_t itemCount; 334 335 ceToCharsStartingWith = new CEToStringsMap(status); 336 337 if (U_FAILURE(status)) { 338 goto bail; 339 } 340 341 #ifdef CLONE_COLLATOR 342 coll = ucol_safeClone(collator, NULL, NULL, &status); 343 344 if (U_FAILURE(status)) { 345 goto bail; 346 } 347 #else 348 coll = collator; 349 #endif 350 351 ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status); 352 353 uset_addAll(charsToTest, contractions); 354 uset_addAll(charsToTest, expansions); 355 uset_removeAll(charsToTest, charsToRemove); 356 357 itemCount = uset_getItemCount(charsToTest); 358 for(int32_t item = 0; item < itemCount; item += 1) { 359 UChar32 start = 0, end = 0; 360 UChar buffer[16]; 361 int32_t len = uset_getItem(charsToTest, item, &start, &end, 362 buffer, 16, &status); 363 364 if (len == 0) { 365 for (UChar32 ch = start; ch <= end; ch += 1) { 366 UnicodeString *st = new UnicodeString(ch); 367 368 if (st == NULL) { 369 status = U_MEMORY_ALLOCATION_ERROR; 370 break; 371 } 372 373 CEList *ceList = new CEList(coll, *st, status); 374 375 ceToCharsStartingWith->put(ceList->get(0), st, status); 376 377 delete ceList; 378 delete st; 379 } 380 } else if (len > 0) { 381 UnicodeString *st = new UnicodeString(buffer, len); 382 383 if (st == NULL) { 384 status = U_MEMORY_ALLOCATION_ERROR; 385 break; 386 } 387 388 CEList *ceList = new CEList(coll, *st, status); 389 390 ceToCharsStartingWith->put(ceList->get(0), st, status); 391 392 delete ceList; 393 delete st; 394 } else { 395 // shouldn't happen... 396 } 397 398 if (U_FAILURE(status)) { 399 break; 400 } 401 } 402 403 bail: 404 uset_close(contractions); 405 uset_close(expansions); 406 uset_close(charsToRemove); 407 uset_close(charsToTest); 408 409 if (U_FAILURE(status)) { 410 return; 411 } 412 413 UChar32 hanRanges[] = {UCOL_FIRST_HAN, UCOL_LAST_HAN, UCOL_FIRST_HAN_COMPAT, UCOL_LAST_HAN_COMPAT, UCOL_FIRST_HAN_A, UCOL_LAST_HAN_A, 414 UCOL_FIRST_HAN_B, UCOL_LAST_HAN_B}; 415 UChar jamoRanges[] = {UCOL_FIRST_L_JAMO, UCOL_FIRST_V_JAMO, UCOL_FIRST_T_JAMO, UCOL_LAST_T_JAMO}; 416 UnicodeString hanString = UnicodeString::fromUTF32(hanRanges, ARRAY_SIZE(hanRanges)); 417 UnicodeString jamoString(FALSE, jamoRanges, ARRAY_SIZE(jamoRanges)); 418 CEList hanList(coll, hanString, status); 419 CEList jamoList(coll, jamoString, status); 420 int32_t j = 0; 421 422 if (U_FAILURE(status)) { 423 return; 424 } 425 426 for (int32_t c = 0; c < jamoList.size(); c += 1) { 427 uint32_t jce = jamoList[c]; 428 429 if (! isContinuation(jce)) { 430 jamoLimits[j++] = jce; 431 } 432 } 433 434 jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT); 435 436 minHan = 0xFFFFFFFF; 437 maxHan = 0; 438 439 for(int32_t h = 0; h < hanList.size(); h += 2) { 440 uint32_t han = (uint32_t) hanList[h]; 441 442 if (han < minHan) { 443 minHan = han; 444 } 445 446 if (han > maxHan) { 447 maxHan = han; 448 } 449 } 450 451 maxHan += (1 << UCOL_PRIMARYORDERSHIFT); 452 } 453 454 CollData::~CollData() 455 { 456 #ifdef CLONE_COLLATOR 457 ucol_close(coll); 458 #endif 459 460 delete ceToCharsStartingWith; 461 } 462 463 UCollator *CollData::getCollator() const 464 { 465 return coll; 466 } 467 468 const StringList *CollData::getStringList(int32_t ce) const 469 { 470 return ceToCharsStartingWith->getStringList(ce); 471 } 472 473 const CEList *CollData::getCEList(const UnicodeString *string) const 474 { 475 UErrorCode status = U_ZERO_ERROR; 476 const CEList *list = new CEList(coll, *string, status); 477 478 if (U_FAILURE(status)) { 479 delete list; 480 list = NULL; 481 } 482 483 return list; 484 } 485 486 void CollData::freeCEList(const CEList *list) 487 { 488 delete list; 489 } 490 491 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset, int32_t *history) const 492 { 493 // find out shortest string for the longest sequence of ces. 494 // this can probably be folded with the minLengthCache... 495 496 if (history[offset] >= 0) { 497 return history[offset]; 498 } 499 500 uint32_t ce = ceList->get(offset); 501 int32_t maxOffset = ceList->size(); 502 int32_t shortestLength = INT32_MAX; 503 const StringList *strings = ceToCharsStartingWith->getStringList(ce); 504 505 if (strings != NULL) { 506 int32_t stringCount = strings->size(); 507 508 for (int32_t s = 0; s < stringCount; s += 1) { 509 const UnicodeString *string = strings->get(s); 510 UErrorCode status = U_ZERO_ERROR; 511 const CEList *ceList2 = new CEList(coll, *string, status); 512 513 if (U_FAILURE(status)) { 514 delete ceList2; 515 ceList2 = NULL; 516 } 517 518 if (ceList->matchesAt(offset, ceList2)) { 519 U_ASSERT(ceList2 != NULL); 520 int32_t clength = ceList2->size(); 521 int32_t slength = string->length(); 522 int32_t roffset = offset + clength; 523 int32_t rlength = 0; 524 525 if (roffset < maxOffset) { 526 rlength = minLengthInChars(ceList, roffset, history); 527 528 if (rlength <= 0) { 529 // delete before continue to avoid memory leak. 530 delete ceList2; 531 532 // ignore any dead ends 533 continue; 534 } 535 } 536 537 if (shortestLength > slength + rlength) { 538 shortestLength = slength + rlength; 539 } 540 } 541 542 delete ceList2; 543 } 544 } 545 546 if (shortestLength == INT32_MAX) { 547 // No matching strings at this offset. See if 548 // the CE is in a range we can handle manually. 549 if (ce >= minHan && ce < maxHan) { 550 // all han have implicit orders which 551 // generate two CEs. 552 int32_t roffset = offset + 2; 553 int32_t rlength = 0; 554 555 //history[roffset++] = -1; 556 //history[roffset++] = 1; 557 558 if (roffset < maxOffset) { 559 rlength = minLengthInChars(ceList, roffset, history); 560 } 561 562 if (rlength < 0) { 563 return -1; 564 } 565 566 shortestLength = 1 + rlength; 567 goto have_shortest; 568 } else if (ce >= jamoLimits[0] && ce < jamoLimits[3]) { 569 int32_t roffset = offset; 570 int32_t rlength = 0; 571 572 // **** this loop may not handle archaic Hangul correctly **** 573 for (int32_t j = 0; roffset < maxOffset && j < 4; j += 1, roffset += 1) { 574 uint32_t jce = ceList->get(roffset); 575 576 // Some Jamo have 24-bit primary order; skip the 577 // 2nd CE. This should always be OK because if 578 // we're still in the loop all we've seen are 579 // a series of Jamo in LVT order. 580 if (isContinuation(jce)) { 581 continue; 582 } 583 584 if (j >= 3 || jce < jamoLimits[j] || jce >= jamoLimits[j + 1]) { 585 break; 586 } 587 } 588 589 if (roffset == offset) { 590 // we started with a non-L Jamo... 591 // just say it comes from a single character 592 roffset += 1; 593 594 // See if the single Jamo has a 24-bit order. 595 if (roffset < maxOffset && isContinuation(ceList->get(roffset))) { 596 roffset += 1; 597 } 598 } 599 600 if (roffset < maxOffset) { 601 rlength = minLengthInChars(ceList, roffset, history); 602 } 603 604 if (rlength < 0) { 605 return -1; 606 } 607 608 shortestLength = 1 + rlength; 609 goto have_shortest; 610 } 611 612 // Can't handle it manually either. Just move on. 613 return -1; 614 } 615 616 have_shortest: 617 history[offset] = shortestLength; 618 619 return shortestLength; 620 } 621 622 int32_t CollData::minLengthInChars(const CEList *ceList, int32_t offset) const 623 { 624 int32_t clength = ceList->size(); 625 int32_t *history = NEW_ARRAY(int32_t, clength); 626 627 for (int32_t i = 0; i < clength; i += 1) { 628 history[i] = -1; 629 } 630 631 int32_t minLength = minLengthInChars(ceList, offset, history); 632 633 DELETE_ARRAY(history); 634 635 return minLength; 636 } 637 638 #endif // #if !UCONFIG_NO_COLLATION 639