1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_COLLATION 13 14 #include "unicode/alphaindex.h" 15 #include "unicode/coll.h" 16 #include "unicode/localpointer.h" 17 #include "unicode/normalizer2.h" 18 #include "unicode/tblcoll.h" 19 #include "unicode/uchar.h" 20 #include "unicode/ulocdata.h" 21 #include "unicode/uniset.h" 22 #include "unicode/uobject.h" 23 #include "unicode/usetiter.h" 24 #include "unicode/utf16.h" 25 26 #include "cmemory.h" 27 #include "cstring.h" 28 #include "uassert.h" 29 #include "uvector.h" 30 #include "uvectr64.h" 31 32 //#include <string> 33 //#include <iostream> 34 35 U_NAMESPACE_BEGIN 36 37 namespace { 38 39 /** 40 * Prefix string for Chinese index buckets. 41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 42 */ 43 const UChar BASE[1] = { 0xFDD0 }; 44 const int32_t BASE_LENGTH = 1; 45 46 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 47 const UnicodeString &one, const UnicodeString &other); 48 49 } // namespace 50 51 static int32_t U_CALLCONV 52 collatorComparator(const void *context, const void *left, const void *right); 53 54 static int32_t U_CALLCONV 55 recordCompareFn(const void *context, const void *left, const void *right); 56 57 // UVector<Record *> support function, delete a Record. 58 static void U_CALLCONV 59 alphaIndex_deleteRecord(void *obj) { 60 delete static_cast<AlphabeticIndex::Record *>(obj); 61 } 62 63 namespace { 64 65 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 66 UErrorCode &errorCode) { 67 if (U_FAILURE(errorCode)) { return NULL; } 68 if (owned.isValid()) { 69 return owned.orphan(); 70 } 71 UnicodeString *p = new UnicodeString(s); 72 if (p == NULL) { 73 errorCode = U_MEMORY_ALLOCATION_ERROR; 74 } 75 return p; 76 } 77 78 inline UnicodeString *getString(const UVector &list, int32_t i) { 79 return static_cast<UnicodeString *>(list[i]); 80 } 81 82 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 83 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 84 } 85 86 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 87 return static_cast<AlphabeticIndex::Record *>(list[i]); 88 } 89 90 /** 91 * Like Java Collections.binarySearch(List, String, Comparator). 92 * 93 * @return the index>=0 where the item was found, 94 * or the index<0 for inserting the string at ~index in sorted order 95 */ 96 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 97 if (list.size() == 0) { return ~0; } 98 int32_t start = 0; 99 int32_t limit = list.size(); 100 for (;;) { 101 int32_t i = (start + limit) / 2; 102 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 103 UErrorCode errorCode = U_ZERO_ERROR; 104 UCollationResult cmp = coll.compare(s, *si, errorCode); 105 if (cmp == UCOL_EQUAL) { 106 return i; 107 } else if (cmp < 0) { 108 if (i == start) { 109 return ~start; // insert s before *si 110 } 111 limit = i; 112 } else { 113 if (i == start) { 114 return ~(start + 1); // insert s after *si 115 } 116 start = i; 117 } 118 } 119 } 120 121 } // namespace 122 123 // The BucketList is not in the anonymous namespace because only Clang 124 // seems to support its use in other classes from there. 125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 126 class BucketList : public UObject { 127 public: 128 BucketList(UVector *bucketList, UVector *publicBucketList) 129 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 130 int32_t displayIndex = 0; 131 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 132 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 133 } 134 } 135 136 // The virtual destructor must not be inline. 137 // See ticket #8454 for details. 138 virtual ~BucketList(); 139 140 int32_t getBucketCount() const { 141 return immutableVisibleList_->size(); 142 } 143 144 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 145 UErrorCode &errorCode) { 146 // binary search 147 int32_t start = 0; 148 int32_t limit = bucketList_->size(); 149 while ((start + 1) < limit) { 150 int32_t i = (start + limit) / 2; 151 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 152 UCollationResult nameVsBucket = 153 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 154 if (nameVsBucket < 0) { 155 limit = i; 156 } else { 157 start = i; 158 } 159 } 160 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 161 if (bucket->displayBucket_ != NULL) { 162 bucket = bucket->displayBucket_; 163 } 164 return bucket->displayIndex_; 165 } 166 167 /** All of the buckets, visible and invisible. */ 168 UVector *bucketList_; 169 /** Just the visible buckets. */ 170 UVector *immutableVisibleList_; 171 }; 172 173 BucketList::~BucketList() { 174 delete bucketList_; 175 if (immutableVisibleList_ != bucketList_) { 176 delete immutableVisibleList_; 177 } 178 } 179 180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 181 delete buckets_; 182 delete collatorPrimaryOnly_; 183 } 184 185 int32_t 186 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 187 return buckets_->getBucketCount(); 188 } 189 190 int32_t 191 AlphabeticIndex::ImmutableIndex::getBucketIndex( 192 const UnicodeString &name, UErrorCode &errorCode) const { 193 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 194 } 195 196 const AlphabeticIndex::Bucket * 197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 198 if (0 <= index && index < buckets_->getBucketCount()) { 199 return icu::getBucket(*buckets_->immutableVisibleList_, index); 200 } else { 201 return NULL; 202 } 203 } 204 205 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 206 : inputList_(NULL), 207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 208 maxLabelCount_(99), 209 initialLabels_(NULL), firstCharsInScripts_(NULL), 210 collator_(NULL), collatorPrimaryOnly_(NULL), 211 buckets_(NULL) { 212 init(&locale, status); 213 } 214 215 216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 217 : inputList_(NULL), 218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 219 maxLabelCount_(99), 220 initialLabels_(NULL), firstCharsInScripts_(NULL), 221 collator_(collator), collatorPrimaryOnly_(NULL), 222 buckets_(NULL) { 223 init(NULL, status); 224 } 225 226 227 228 AlphabeticIndex::~AlphabeticIndex() { 229 delete collator_; 230 delete collatorPrimaryOnly_; 231 delete firstCharsInScripts_; 232 delete buckets_; 233 delete inputList_; 234 delete initialLabels_; 235 } 236 237 238 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 239 if (U_FAILURE(status)) { 240 return *this; 241 } 242 initialLabels_->addAll(additions); 243 clearBuckets(); 244 return *this; 245 } 246 247 248 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 249 addIndexExemplars(locale, status); 250 clearBuckets(); 251 return *this; 252 } 253 254 255 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 256 if (U_FAILURE(errorCode)) { return NULL; } 257 // In C++, the ImmutableIndex must own its copy of the BucketList, 258 // even if it contains no records, for proper memory management. 259 // We could clone the buckets_ if they are not NULL, 260 // but that would be worth it only if this method is called multiple times, 261 // or called after using the old-style bucket iterator API. 262 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 263 LocalPointer<RuleBasedCollator> coll( 264 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 265 if (immutableBucketList.isNull() || coll.isNull()) { 266 errorCode = U_MEMORY_ALLOCATION_ERROR; 267 return NULL; 268 } 269 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 270 if (immIndex == NULL) { 271 errorCode = U_MEMORY_ALLOCATION_ERROR; 272 return NULL; 273 } 274 // The ImmutableIndex adopted its parameter objects. 275 immutableBucketList.orphan(); 276 coll.orphan(); 277 return immIndex; 278 } 279 280 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 281 initBuckets(status); 282 if (U_FAILURE(status)) { 283 return 0; 284 } 285 return buckets_->getBucketCount(); 286 } 287 288 289 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 290 if (U_FAILURE(status) || inputList_ == NULL) { 291 return 0; 292 } 293 return inputList_->size(); 294 } 295 296 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 297 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 298 if (U_FAILURE(errorCode)) { return; } 299 300 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 301 const UnicodeString &overflowBoundary = 302 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 303 304 // We make a sorted array of elements. 305 // Some of the input may be redundant. 306 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 307 // We filter out those cases. 308 UnicodeSetIterator iter(*initialLabels_); 309 while (iter.next()) { 310 const UnicodeString *item = &iter.getString(); 311 LocalPointer<UnicodeString> ownedItem; 312 UBool checkDistinct; 313 int32_t itemLength = item->length(); 314 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 315 checkDistinct = FALSE; 316 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 317 item->charAt(itemLength - 2) != 0x2a) { 318 // Use a label if it is marked with one trailing star, 319 // even if the label string sorts the same when all contractions are suppressed. 320 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 321 item = ownedItem.getAlias(); 322 if (item == NULL) { 323 errorCode = U_MEMORY_ALLOCATION_ERROR; 324 return; 325 } 326 checkDistinct = FALSE; 327 } else { 328 checkDistinct = TRUE; 329 } 330 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 331 // Ignore a primary-ignorable or non-alphabetic index character. 332 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 333 // Ignore an index character that will land in the overflow bucket. 334 } else if (checkDistinct && 335 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 336 // Ignore a multi-code point index character that does not sort distinctly 337 // from the sequence of its separate characters. 338 } else { 339 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 340 if (insertionPoint < 0) { 341 indexCharacters.insertElementAt( 342 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 343 } else { 344 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 345 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 346 indexCharacters.setElementAt( 347 ownedString(*item, ownedItem, errorCode), insertionPoint); 348 } 349 } 350 } 351 } 352 if (U_FAILURE(errorCode)) { return; } 353 354 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element 355 356 int32_t size = indexCharacters.size() - 1; 357 if (size > maxLabelCount_) { 358 int32_t count = 0; 359 int32_t old = -1; 360 for (int32_t i = 0; i < indexCharacters.size();) { 361 ++count; 362 int32_t bump = count * maxLabelCount_ / size; 363 if (bump == old) { 364 indexCharacters.removeElementAt(i); 365 } else { 366 old = bump; 367 ++i; 368 } 369 } 370 } 371 } 372 373 namespace { 374 375 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 376 if (!current.startsWith(BASE, BASE_LENGTH)) { 377 return current; 378 } 379 UChar rest = current.charAt(BASE_LENGTH); 380 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 381 int32_t count = rest-0x2800; 382 temp.setTo((UChar)(0x30 + count % 10)); 383 if (count >= 10) { 384 count /= 10; 385 temp.insert(0, (UChar)(0x30 + count % 10)); 386 if (count >= 10) { 387 count /= 10; 388 temp.insert(0, (UChar)(0x30 + count)); 389 } 390 } 391 return temp.append((UChar)0x5283); 392 } 393 return temp.setTo(current, BASE_LENGTH); 394 } 395 396 UBool hasMultiplePrimaryWeights( 397 const RuleBasedCollator &coll, uint32_t variableTop, 398 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { 399 ces.removeAllElements(); 400 coll.internalGetCEs(s, ces, errorCode); 401 if (U_FAILURE(errorCode)) { return FALSE; } 402 UBool seenPrimary = FALSE; 403 for (int32_t i = 0; i < ces.size(); ++i) { 404 int64_t ce = ces.elementAti(i); 405 uint32_t p = (uint32_t)(ce >> 32); 406 if (p > variableTop) { 407 // not primary ignorable 408 if (seenPrimary) { 409 return TRUE; 410 } 411 seenPrimary = TRUE; 412 } 413 } 414 return FALSE; 415 } 416 417 } // namespace 418 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 420 // Initialize indexCharacters. 421 UVector indexCharacters(errorCode); 422 indexCharacters.setDeleter(uprv_deleteUObject); 423 initLabels(indexCharacters, errorCode); 424 if (U_FAILURE(errorCode)) { return NULL; } 425 426 // Variables for hasMultiplePrimaryWeights(). 427 UVector64 ces(errorCode); 428 uint32_t variableTop; 429 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 430 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); 431 } else { 432 variableTop = 0; 433 } 434 UBool hasInvisibleBuckets = FALSE; 435 436 // Helper arrays for Chinese Pinyin collation. 437 Bucket *asciiBuckets[26] = { 438 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 439 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 440 }; 441 Bucket *pinyinBuckets[26] = { 442 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 443 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 444 }; 445 UBool hasPinyin = FALSE; 446 447 LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode); 448 if (U_FAILURE(errorCode)) { 449 return NULL; 450 } 451 bucketList->setDeleter(uprv_deleteUObject); 452 453 // underflow bucket 454 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 455 if (bucket == NULL) { 456 errorCode = U_MEMORY_ALLOCATION_ERROR; 457 return NULL; 458 } 459 bucketList->addElement(bucket, errorCode); 460 if (U_FAILURE(errorCode)) { return NULL; } 461 462 UnicodeString temp; 463 464 // fix up the list, adding underflow, additions, overflow 465 // Insert inflow labels as needed. 466 int32_t scriptIndex = -1; 467 const UnicodeString *scriptUpperBoundary = &emptyString_; 468 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 469 UnicodeString ¤t = *getString(indexCharacters, i); 470 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 471 // We crossed the script boundary into a new script. 472 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 473 UBool skippedScript = FALSE; 474 for (;;) { 475 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 476 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 477 break; 478 } 479 skippedScript = TRUE; 480 } 481 if (skippedScript && bucketList->size() > 1) { 482 // We are skipping one or more scripts, 483 // and we are not just getting out of the underflow label. 484 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 485 if (bucket == NULL) { 486 errorCode = U_MEMORY_ALLOCATION_ERROR; 487 return NULL; 488 } 489 bucketList->addElement(bucket, errorCode); 490 } 491 } 492 // Add a bucket with the current label. 493 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 494 if (bucket == NULL) { 495 errorCode = U_MEMORY_ALLOCATION_ERROR; 496 return NULL; 497 } 498 bucketList->addElement(bucket, errorCode); 499 // Remember ASCII and Pinyin buckets for Pinyin redirects. 500 UChar c; 501 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 502 asciiBuckets[c - 0x41] = bucket; 503 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 504 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 505 pinyinBuckets[c - 0x41] = bucket; 506 hasPinyin = TRUE; 507 } 508 // Check for multiple primary weights. 509 if (!current.startsWith(BASE, BASE_LENGTH) && 510 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, 511 ces, errorCode) && 512 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 513 // "AE-ligature" or "Sch" etc. 514 for (int32_t i = bucketList->size() - 2;; --i) { 515 Bucket *singleBucket = getBucket(*bucketList, i); 516 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 517 // There is no single-character bucket since the last 518 // underflow or inflow label. 519 break; 520 } 521 if (singleBucket->displayBucket_ == NULL && 522 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, 523 singleBucket->lowerBoundary_, 524 ces, errorCode)) { 525 // Add an invisible bucket that redirects strings greater than the expansion 526 // to the previous single-character bucket. 527 // For example, after ... Q R S Sch we add Sch\uFFFF->S 528 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 529 bucket = new Bucket(emptyString_, 530 UnicodeString(current).append((UChar)0xFFFF), 531 U_ALPHAINDEX_NORMAL); 532 if (bucket == NULL) { 533 errorCode = U_MEMORY_ALLOCATION_ERROR; 534 return NULL; 535 } 536 bucket->displayBucket_ = singleBucket; 537 bucketList->addElement(bucket, errorCode); 538 hasInvisibleBuckets = TRUE; 539 break; 540 } 541 } 542 } 543 } 544 if (U_FAILURE(errorCode)) { return NULL; } 545 if (bucketList->size() == 1) { 546 // No real labels, show only the underflow label. 547 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 548 if (bl == NULL) { 549 errorCode = U_MEMORY_ALLOCATION_ERROR; 550 return NULL; 551 } 552 bucketList.orphan(); 553 return bl; 554 } 555 // overflow bucket 556 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 557 if (bucket == NULL) { 558 errorCode = U_MEMORY_ALLOCATION_ERROR; 559 return NULL; 560 } 561 bucketList->addElement(bucket, errorCode); // final 562 563 if (hasPinyin) { 564 // Redirect Pinyin buckets. 565 Bucket *asciiBucket = NULL; 566 for (int32_t i = 0; i < 26; ++i) { 567 if (asciiBuckets[i] != NULL) { 568 asciiBucket = asciiBuckets[i]; 569 } 570 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 571 pinyinBuckets[i]->displayBucket_ = asciiBucket; 572 hasInvisibleBuckets = TRUE; 573 } 574 } 575 } 576 577 if (U_FAILURE(errorCode)) { return NULL; } 578 if (!hasInvisibleBuckets) { 579 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 580 if (bl == NULL) { 581 errorCode = U_MEMORY_ALLOCATION_ERROR; 582 return NULL; 583 } 584 bucketList.orphan(); 585 return bl; 586 } 587 // Merge inflow buckets that are visually adjacent. 588 // Iterate backwards: Merge inflow into overflow rather than the other way around. 589 int32_t i = bucketList->size() - 1; 590 Bucket *nextBucket = getBucket(*bucketList, i); 591 while (--i > 0) { 592 bucket = getBucket(*bucketList, i); 593 if (bucket->displayBucket_ != NULL) { 594 continue; // skip invisible buckets 595 } 596 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 597 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 598 bucket->displayBucket_ = nextBucket; 599 continue; 600 } 601 } 602 nextBucket = bucket; 603 } 604 605 LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); 606 if (U_FAILURE(errorCode)) { 607 return NULL; 608 } 609 // Do not call publicBucketList->setDeleter(): 610 // This vector shares its objects with the bucketList. 611 for (int32_t i = 0; i < bucketList->size(); ++i) { 612 bucket = getBucket(*bucketList, i); 613 if (bucket->displayBucket_ == NULL) { 614 publicBucketList->addElement(bucket, errorCode); 615 } 616 } 617 if (U_FAILURE(errorCode)) { return NULL; } 618 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 619 if (bl == NULL) { 620 errorCode = U_MEMORY_ALLOCATION_ERROR; 621 return NULL; 622 } 623 bucketList.orphan(); 624 publicBucketList.orphan(); 625 return bl; 626 } 627 628 /** 629 * Creates an index, and buckets and sorts the list of records into the index. 630 */ 631 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 632 if (U_FAILURE(errorCode) || buckets_ != NULL) { 633 return; 634 } 635 buckets_ = createBucketList(errorCode); 636 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 637 return; 638 } 639 640 // Sort the records by name. 641 // Stable sort preserves input order of collation duplicates. 642 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 643 644 // Now, we traverse all of the input, which is now sorted. 645 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 646 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 647 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 648 // we need to improve it for that case. 649 650 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 651 int32_t bucketIndex = 1; 652 Bucket *nextBucket; 653 const UnicodeString *upperBoundary; 654 if (bucketIndex < buckets_->bucketList_->size()) { 655 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 656 upperBoundary = &nextBucket->lowerBoundary_; 657 } else { 658 nextBucket = NULL; 659 upperBoundary = NULL; 660 } 661 for (int32_t i = 0; i < inputList_->size(); ++i) { 662 Record *r = getRecord(*inputList_, i); 663 // if the current bucket isn't the right one, find the one that is 664 // We have a special flag for the last bucket so that we don't look any further 665 while (upperBoundary != NULL && 666 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 667 currentBucket = nextBucket; 668 // now reset the boundary that we compare against 669 if (bucketIndex < buckets_->bucketList_->size()) { 670 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 671 upperBoundary = &nextBucket->lowerBoundary_; 672 } else { 673 upperBoundary = NULL; 674 } 675 } 676 // now put the record into the bucket. 677 Bucket *bucket = currentBucket; 678 if (bucket->displayBucket_ != NULL) { 679 bucket = bucket->displayBucket_; 680 } 681 if (bucket->records_ == NULL) { 682 bucket->records_ = new UVector(errorCode); 683 if (bucket->records_ == NULL) { 684 errorCode = U_MEMORY_ALLOCATION_ERROR; 685 return; 686 } 687 } 688 bucket->records_->addElement(r, errorCode); 689 } 690 } 691 692 void AlphabeticIndex::clearBuckets() { 693 if (buckets_ != NULL) { 694 delete buckets_; 695 buckets_ = NULL; 696 internalResetBucketIterator(); 697 } 698 } 699 700 void AlphabeticIndex::internalResetBucketIterator() { 701 labelsIterIndex_ = -1; 702 currentBucket_ = NULL; 703 } 704 705 706 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 707 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 708 if (U_FAILURE(status)) { 709 return; 710 } 711 712 UnicodeSet exemplars; 713 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 714 if (U_SUCCESS(status)) { 715 initialLabels_->addAll(exemplars); 716 return; 717 } 718 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 719 720 // The locale data did not include explicit Index characters. 721 // Synthesize a set of them from the locale's standard exemplar characters. 722 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 723 if (U_FAILURE(status)) { 724 return; 725 } 726 727 // question: should we add auxiliary exemplars? 728 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 729 exemplars.add(0x61, 0x7A); 730 } 731 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 732 // cut down to small list 733 exemplars.remove(0xAC00, 0xD7A3). 734 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 735 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 736 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 737 add(0xD30C).add(0xD558); 738 } 739 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 740 // cut down to small list 741 // make use of the fact that Ethiopic is allocated in 8's, where 742 // the base is 0 mod 8. 743 UnicodeSet ethiopic( 744 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 745 UnicodeSetIterator it(ethiopic); 746 while (it.next() && !it.isString()) { 747 if ((it.getCodepoint() & 0x7) != 0) { 748 exemplars.remove(it.getCodepoint()); 749 } 750 } 751 } 752 753 // Upper-case any that aren't already so. 754 // (We only do this for synthesized index characters.) 755 UnicodeSetIterator it(exemplars); 756 UnicodeString upperC; 757 while (it.next()) { 758 const UnicodeString &exemplarC = it.getString(); 759 upperC = exemplarC; 760 upperC.toUpper(locale); 761 initialLabels_->add(upperC); 762 } 763 } 764 765 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 766 UnicodeSet contractions; 767 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); 768 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } 769 initialLabels_->addAll(contractions); 770 UnicodeSetIterator iter(contractions); 771 while (iter.next()) { 772 const UnicodeString &s = iter.getString(); 773 U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); 774 UChar c = s.charAt(s.length() - 1); 775 if (0x41 <= c && c <= 0x5A) { // A-Z 776 // There are Pinyin labels, add ASCII A-Z labels as well. 777 initialLabels_->add(0x41, 0x5A); // A-Z 778 break; 779 } 780 } 781 return TRUE; 782 } 783 784 785 /* 786 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 787 */ 788 static const UChar CGJ = 0x034F; 789 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 790 UnicodeString result; 791 if (item.length() == 0) { 792 return result; 793 } 794 int32_t i = 0; 795 for (;;) { 796 UChar32 cp = item.char32At(i); 797 result.append(cp); 798 i = item.moveIndex32(i, 1); 799 if (i >= item.length()) { 800 break; 801 } 802 result.append(CGJ); 803 } 804 return result; 805 } 806 807 808 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 809 return FALSE; 810 } 811 812 813 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 814 return FALSE; 815 } 816 817 818 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 819 return *collator_; 820 } 821 822 823 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 824 return inflowLabel_; 825 } 826 827 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 828 return overflowLabel_; 829 } 830 831 832 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 833 return underflowLabel_; 834 } 835 836 837 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 838 inflowLabel_ = label; 839 clearBuckets(); 840 return *this; 841 } 842 843 844 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 845 overflowLabel_ = label; 846 clearBuckets(); 847 return *this; 848 } 849 850 851 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 852 underflowLabel_ = label; 853 clearBuckets(); 854 return *this; 855 } 856 857 858 int32_t AlphabeticIndex::getMaxLabelCount() const { 859 return maxLabelCount_; 860 } 861 862 863 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 864 if (U_FAILURE(status)) { 865 return *this; 866 } 867 if (maxLabelCount <= 0) { 868 status = U_ILLEGAL_ARGUMENT_ERROR; 869 return *this; 870 } 871 maxLabelCount_ = maxLabelCount; 872 clearBuckets(); 873 return *this; 874 } 875 876 877 // 878 // init() - Common code for constructors. 879 // 880 881 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 882 if (U_FAILURE(status)) { return; } 883 if (locale == NULL && collator_ == NULL) { 884 status = U_ILLEGAL_ARGUMENT_ERROR; 885 return; 886 } 887 888 initialLabels_ = new UnicodeSet(); 889 if (initialLabels_ == NULL) { 890 status = U_MEMORY_ALLOCATION_ERROR; 891 return; 892 } 893 894 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 895 overflowLabel_ = inflowLabel_; 896 underflowLabel_ = inflowLabel_; 897 898 if (collator_ == NULL) { 899 Collator *coll = Collator::createInstance(*locale, status); 900 if (U_FAILURE(status)) { 901 delete coll; 902 return; 903 } 904 if (coll == NULL) { 905 status = U_MEMORY_ALLOCATION_ERROR; 906 return; 907 } 908 collator_ = dynamic_cast<RuleBasedCollator *>(coll); 909 if (collator_ == NULL) { 910 delete coll; 911 status = U_UNSUPPORTED_ERROR; 912 return; 913 } 914 } 915 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 916 if (collatorPrimaryOnly_ == NULL) { 917 status = U_MEMORY_ALLOCATION_ERROR; 918 return; 919 } 920 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 921 firstCharsInScripts_ = firstStringsInScript(status); 922 if (U_FAILURE(status)) { return; } 923 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 924 // Guard against a degenerate collator where 925 // some script boundary strings are primary ignorable. 926 for (;;) { 927 if (U_FAILURE(status)) { return; } 928 if (firstCharsInScripts_->isEmpty()) { 929 // AlphabeticIndex requires some non-ignorable script boundary strings. 930 status = U_ILLEGAL_ARGUMENT_ERROR; 931 return; 932 } 933 if (collatorPrimaryOnly_->compare( 934 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 935 emptyString_, status) == UCOL_EQUAL) { 936 firstCharsInScripts_->removeElementAt(0); 937 } else { 938 break; 939 } 940 } 941 942 // Chinese index characters, which are specific to each of the several Chinese tailorings, 943 // take precedence over the single locale data exemplar set per language. 944 if (!addChineseIndexCharacters(status) && locale != NULL) { 945 addIndexExemplars(*locale, status); 946 } 947 } 948 949 950 // 951 // Comparison function for UVector<UnicodeString *> sorting with a collator. 952 // 953 static int32_t U_CALLCONV 954 collatorComparator(const void *context, const void *left, const void *right) { 955 const UElement *leftElement = static_cast<const UElement *>(left); 956 const UElement *rightElement = static_cast<const UElement *>(right); 957 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 958 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 959 960 if (leftString == rightString) { 961 // Catches case where both are NULL 962 return 0; 963 } 964 if (leftString == NULL) { 965 return 1; 966 }; 967 if (rightString == NULL) { 968 return -1; 969 } 970 const Collator *col = static_cast<const Collator *>(context); 971 UErrorCode errorCode = U_ZERO_ERROR; 972 return col->compare(*leftString, *rightString, errorCode); 973 } 974 975 // 976 // Comparison function for UVector<Record *> sorting with a collator. 977 // 978 static int32_t U_CALLCONV 979 recordCompareFn(const void *context, const void *left, const void *right) { 980 const UElement *leftElement = static_cast<const UElement *>(left); 981 const UElement *rightElement = static_cast<const UElement *>(right); 982 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 983 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 984 const Collator *col = static_cast<const Collator *>(context); 985 UErrorCode errorCode = U_ZERO_ERROR; 986 return col->compare(leftRec->name_, rightRec->name_, errorCode); 987 } 988 989 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 990 if (U_FAILURE(status)) { 991 return NULL; 992 } 993 LocalPointer<UVector> dest(new UVector(status), status); 994 if (U_FAILURE(status)) { 995 return NULL; 996 } 997 dest->setDeleter(uprv_deleteUObject); 998 // Fetch the script-first-primary contractions which are defined in the root collator. 999 // They all start with U+FDD1. 1000 UnicodeSet set; 1001 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); 1002 if (U_FAILURE(status)) { 1003 return NULL; 1004 } 1005 if (set.isEmpty()) { 1006 status = U_UNSUPPORTED_ERROR; 1007 return NULL; 1008 } 1009 UnicodeSetIterator iter(set); 1010 while (iter.next()) { 1011 const UnicodeString &boundary = iter.getString(); 1012 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); 1013 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { 1014 // Ignore boundaries for the special reordering groups. 1015 // Take only those for "real scripts" (where the sample character is a Letter, 1016 // and the one for unassigned implicit weights (Cn). 1017 continue; 1018 } 1019 UnicodeString *s = new UnicodeString(boundary); 1020 if (s == NULL) { 1021 status = U_MEMORY_ALLOCATION_ERROR; 1022 return NULL; 1023 } 1024 dest->addElement(s, status); 1025 } 1026 return dest.orphan(); 1027 } 1028 1029 1030 namespace { 1031 1032 /** 1033 * Returns true if one index character string is "better" than the other. 1034 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1035 * better, and otherwise binary-less-than is better. 1036 */ 1037 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1038 const UnicodeString &one, const UnicodeString &other) { 1039 // This is called with primary-equal strings, but never with one.equals(other). 1040 UErrorCode status = U_ZERO_ERROR; 1041 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1042 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1043 if (U_FAILURE(status)) { return FALSE; } 1044 int32_t result = n1.countChar32() - n2.countChar32(); 1045 if (result != 0) { 1046 return result < 0; 1047 } 1048 result = n1.compareCodePointOrder(n2); 1049 if (result != 0) { 1050 return result < 0; 1051 } 1052 return one.compareCodePointOrder(other) < 0; 1053 } 1054 1055 } // namespace 1056 1057 // 1058 // Constructor & Destructor for AlphabeticIndex::Record 1059 // 1060 // Records are internal only, instances are not directly surfaced in the public API. 1061 // This class is mostly struct-like, with all public fields. 1062 1063 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1064 : name_(name), data_(data) {} 1065 1066 AlphabeticIndex::Record::~Record() { 1067 } 1068 1069 1070 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1071 if (U_FAILURE(status)) { 1072 return *this; 1073 } 1074 if (inputList_ == NULL) { 1075 inputList_ = new UVector(status); 1076 if (inputList_ == NULL) { 1077 status = U_MEMORY_ALLOCATION_ERROR; 1078 return *this; 1079 } 1080 inputList_->setDeleter(alphaIndex_deleteRecord); 1081 } 1082 Record *r = new Record(name, data); 1083 if (r == NULL) { 1084 status = U_MEMORY_ALLOCATION_ERROR; 1085 return *this; 1086 } 1087 inputList_->addElement(r, status); 1088 clearBuckets(); 1089 //std::string ss; 1090 //std::string ss2; 1091 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1092 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1093 return *this; 1094 } 1095 1096 1097 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1098 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1099 inputList_->removeAllElements(); 1100 clearBuckets(); 1101 } 1102 return *this; 1103 } 1104 1105 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1106 initBuckets(status); 1107 if (U_FAILURE(status)) { 1108 return 0; 1109 } 1110 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1111 } 1112 1113 1114 int32_t AlphabeticIndex::getBucketIndex() const { 1115 return labelsIterIndex_; 1116 } 1117 1118 1119 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1120 if (U_FAILURE(status)) { 1121 return FALSE; 1122 } 1123 if (buckets_ == NULL && currentBucket_ != NULL) { 1124 status = U_ENUM_OUT_OF_SYNC_ERROR; 1125 return FALSE; 1126 } 1127 initBuckets(status); 1128 if (U_FAILURE(status)) { 1129 return FALSE; 1130 } 1131 ++labelsIterIndex_; 1132 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1133 labelsIterIndex_ = buckets_->getBucketCount(); 1134 return FALSE; 1135 } 1136 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1137 resetRecordIterator(); 1138 return TRUE; 1139 } 1140 1141 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1142 if (currentBucket_ != NULL) { 1143 return currentBucket_->label_; 1144 } else { 1145 return emptyString_; 1146 } 1147 } 1148 1149 1150 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1151 if (currentBucket_ != NULL) { 1152 return currentBucket_->labelType_; 1153 } else { 1154 return U_ALPHAINDEX_NORMAL; 1155 } 1156 } 1157 1158 1159 int32_t AlphabeticIndex::getBucketRecordCount() const { 1160 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1161 return currentBucket_->records_->size(); 1162 } else { 1163 return 0; 1164 } 1165 } 1166 1167 1168 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1169 if (U_FAILURE(status)) { 1170 return *this; 1171 } 1172 internalResetBucketIterator(); 1173 return *this; 1174 } 1175 1176 1177 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1178 if (U_FAILURE(status)) { 1179 return FALSE; 1180 } 1181 if (currentBucket_ == NULL) { 1182 // We are trying to iterate over the items in a bucket, but there is no 1183 // current bucket from the enumeration of buckets. 1184 status = U_INVALID_STATE_ERROR; 1185 return FALSE; 1186 } 1187 if (buckets_ == NULL) { 1188 status = U_ENUM_OUT_OF_SYNC_ERROR; 1189 return FALSE; 1190 } 1191 if (currentBucket_->records_ == NULL) { 1192 return FALSE; 1193 } 1194 ++itemsIterIndex_; 1195 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1196 itemsIterIndex_ = currentBucket_->records_->size(); 1197 return FALSE; 1198 } 1199 return TRUE; 1200 } 1201 1202 1203 const UnicodeString &AlphabeticIndex::getRecordName() const { 1204 const UnicodeString *retStr = &emptyString_; 1205 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1206 itemsIterIndex_ >= 0 && 1207 itemsIterIndex_ < currentBucket_->records_->size()) { 1208 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1209 retStr = &item->name_; 1210 } 1211 return *retStr; 1212 } 1213 1214 const void *AlphabeticIndex::getRecordData() const { 1215 const void *retPtr = NULL; 1216 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1217 itemsIterIndex_ >= 0 && 1218 itemsIterIndex_ < currentBucket_->records_->size()) { 1219 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1220 retPtr = item->data_; 1221 } 1222 return retPtr; 1223 } 1224 1225 1226 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1227 itemsIterIndex_ = -1; 1228 return *this; 1229 } 1230 1231 1232 1233 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1234 const UnicodeString &lowerBoundary, 1235 UAlphabeticIndexLabelType type) 1236 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1237 displayBucket_(NULL), displayIndex_(-1), 1238 records_(NULL) { 1239 } 1240 1241 1242 AlphabeticIndex::Bucket::~Bucket() { 1243 delete records_; 1244 } 1245 1246 U_NAMESPACE_END 1247 1248 #endif // !UCONFIG_NO_COLLATION 1249