1 /* 2 ******************************************************************************* 3 * Copyright (C) 2009-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/alphaindex.h" 13 #include "unicode/coll.h" 14 #include "unicode/localpointer.h" 15 #include "unicode/normalizer2.h" 16 #include "unicode/tblcoll.h" 17 #include "unicode/uchar.h" 18 #include "unicode/ulocdata.h" 19 #include "unicode/uniset.h" 20 #include "unicode/uobject.h" 21 #include "unicode/usetiter.h" 22 #include "unicode/utf16.h" 23 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "uassert.h" 27 #include "uvector.h" 28 #include "uvectr64.h" 29 30 //#include <string> 31 //#include <iostream> 32 33 U_NAMESPACE_BEGIN 34 35 namespace { 36 37 /** 38 * Prefix string for Chinese index buckets. 39 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 40 */ 41 const UChar BASE[1] = { 0xFDD0 }; 42 const int32_t BASE_LENGTH = 1; 43 44 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 45 const UnicodeString &one, const UnicodeString &other); 46 47 } // namespace 48 49 static int32_t U_CALLCONV 50 collatorComparator(const void *context, const void *left, const void *right); 51 52 static int32_t U_CALLCONV 53 recordCompareFn(const void *context, const void *left, const void *right); 54 55 // UVector<Record *> support function, delete a Record. 56 static void U_CALLCONV 57 alphaIndex_deleteRecord(void *obj) { 58 delete static_cast<AlphabeticIndex::Record *>(obj); 59 } 60 61 namespace { 62 63 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 64 UErrorCode &errorCode) { 65 if (U_FAILURE(errorCode)) { return NULL; } 66 if (owned.isValid()) { 67 return owned.orphan(); 68 } 69 UnicodeString *p = new UnicodeString(s); 70 if (p == NULL) { 71 errorCode = U_MEMORY_ALLOCATION_ERROR; 72 } 73 return p; 74 } 75 76 inline UnicodeString *getString(const UVector &list, int32_t i) { 77 return static_cast<UnicodeString *>(list[i]); 78 } 79 80 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 81 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 82 } 83 84 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 85 return static_cast<AlphabeticIndex::Record *>(list[i]); 86 } 87 88 /** 89 * Like Java Collections.binarySearch(List, String, Comparator). 90 * 91 * @return the index>=0 where the item was found, 92 * or the index<0 for inserting the string at ~index in sorted order 93 */ 94 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 95 if (list.size() == 0) { return ~0; } 96 int32_t start = 0; 97 int32_t limit = list.size(); 98 for (;;) { 99 int32_t i = (start + limit) / 2; 100 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 101 UErrorCode errorCode = U_ZERO_ERROR; 102 UCollationResult cmp = coll.compare(s, *si, errorCode); 103 if (cmp == UCOL_EQUAL) { 104 return i; 105 } else if (cmp < 0) { 106 if (i == start) { 107 return ~start; // insert s before *si 108 } 109 limit = i; 110 } else { 111 if (i == start) { 112 return ~(start + 1); // insert s after *si 113 } 114 start = i; 115 } 116 } 117 } 118 119 } // namespace 120 121 // The BucketList is not in the anonymous namespace because only Clang 122 // seems to support its use in other classes from there. 123 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 124 class BucketList : public UObject { 125 public: 126 BucketList(UVector *bucketList, UVector *publicBucketList) 127 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 128 int32_t displayIndex = 0; 129 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 130 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 131 } 132 } 133 134 // The virtual destructor must not be inline. 135 // See ticket #8454 for details. 136 virtual ~BucketList(); 137 138 int32_t getBucketCount() const { 139 return immutableVisibleList_->size(); 140 } 141 142 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 143 UErrorCode &errorCode) { 144 // binary search 145 int32_t start = 0; 146 int32_t limit = bucketList_->size(); 147 while ((start + 1) < limit) { 148 int32_t i = (start + limit) / 2; 149 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 150 UCollationResult nameVsBucket = 151 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 152 if (nameVsBucket < 0) { 153 limit = i; 154 } else { 155 start = i; 156 } 157 } 158 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 159 if (bucket->displayBucket_ != NULL) { 160 bucket = bucket->displayBucket_; 161 } 162 return bucket->displayIndex_; 163 } 164 165 /** All of the buckets, visible and invisible. */ 166 UVector *bucketList_; 167 /** Just the visible buckets. */ 168 UVector *immutableVisibleList_; 169 }; 170 171 BucketList::~BucketList() { 172 delete bucketList_; 173 if (immutableVisibleList_ != bucketList_) { 174 delete immutableVisibleList_; 175 } 176 } 177 178 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 179 delete buckets_; 180 delete collatorPrimaryOnly_; 181 } 182 183 int32_t 184 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 185 return buckets_->getBucketCount(); 186 } 187 188 int32_t 189 AlphabeticIndex::ImmutableIndex::getBucketIndex( 190 const UnicodeString &name, UErrorCode &errorCode) const { 191 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 192 } 193 194 const AlphabeticIndex::Bucket * 195 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 196 if (0 <= index && index < buckets_->getBucketCount()) { 197 return icu::getBucket(*buckets_->immutableVisibleList_, index); 198 } else { 199 return NULL; 200 } 201 } 202 203 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 204 : inputList_(NULL), 205 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 206 maxLabelCount_(99), 207 initialLabels_(NULL), firstCharsInScripts_(NULL), 208 collator_(NULL), collatorPrimaryOnly_(NULL), 209 buckets_(NULL) { 210 init(&locale, status); 211 } 212 213 214 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 215 : inputList_(NULL), 216 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 217 maxLabelCount_(99), 218 initialLabels_(NULL), firstCharsInScripts_(NULL), 219 collator_(collator), collatorPrimaryOnly_(NULL), 220 buckets_(NULL) { 221 init(NULL, status); 222 } 223 224 225 226 AlphabeticIndex::~AlphabeticIndex() { 227 delete collator_; 228 delete collatorPrimaryOnly_; 229 delete firstCharsInScripts_; 230 delete buckets_; 231 delete inputList_; 232 delete initialLabels_; 233 } 234 235 236 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 237 if (U_FAILURE(status)) { 238 return *this; 239 } 240 initialLabels_->addAll(additions); 241 clearBuckets(); 242 return *this; 243 } 244 245 246 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 247 addIndexExemplars(locale, status); 248 clearBuckets(); 249 return *this; 250 } 251 252 253 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 254 if (U_FAILURE(errorCode)) { return NULL; } 255 // In C++, the ImmutableIndex must own its copy of the BucketList, 256 // even if it contains no records, for proper memory management. 257 // We could clone the buckets_ if they are not NULL, 258 // but that would be worth it only if this method is called multiple times, 259 // or called after using the old-style bucket iterator API. 260 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 261 LocalPointer<RuleBasedCollator> coll( 262 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 263 if (immutableBucketList.isNull() || coll.isNull()) { 264 errorCode = U_MEMORY_ALLOCATION_ERROR; 265 return NULL; 266 } 267 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 268 if (immIndex == NULL) { 269 errorCode = U_MEMORY_ALLOCATION_ERROR; 270 return NULL; 271 } 272 // The ImmutableIndex adopted its parameter objects. 273 immutableBucketList.orphan(); 274 coll.orphan(); 275 return immIndex; 276 } 277 278 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 279 initBuckets(status); 280 if (U_FAILURE(status)) { 281 return 0; 282 } 283 return buckets_->getBucketCount(); 284 } 285 286 287 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 288 if (U_FAILURE(status) || inputList_ == NULL) { 289 return 0; 290 } 291 return inputList_->size(); 292 } 293 294 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 295 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 296 if (U_FAILURE(errorCode)) { return; } 297 298 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 299 const UnicodeString &overflowBoundary = 300 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 301 302 // We make a sorted array of elements. 303 // Some of the input may be redundant. 304 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 305 // We filter out those cases. 306 UnicodeSetIterator iter(*initialLabels_); 307 while (iter.next()) { 308 const UnicodeString *item = &iter.getString(); 309 LocalPointer<UnicodeString> ownedItem; 310 UBool checkDistinct; 311 int32_t itemLength = item->length(); 312 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 313 checkDistinct = FALSE; 314 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 315 item->charAt(itemLength - 2) != 0x2a) { 316 // Use a label if it is marked with one trailing star, 317 // even if the label string sorts the same when all contractions are suppressed. 318 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 319 item = ownedItem.getAlias(); 320 if (item == NULL) { 321 errorCode = U_MEMORY_ALLOCATION_ERROR; 322 return; 323 } 324 checkDistinct = FALSE; 325 } else { 326 checkDistinct = TRUE; 327 } 328 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 329 // Ignore a primary-ignorable or non-alphabetic index character. 330 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 331 // Ignore an index character that will land in the overflow bucket. 332 } else if (checkDistinct && 333 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 334 // Ignore a multi-code point index character that does not sort distinctly 335 // from the sequence of its separate characters. 336 } else { 337 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 338 if (insertionPoint < 0) { 339 indexCharacters.insertElementAt( 340 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 341 } else { 342 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 343 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 344 indexCharacters.setElementAt( 345 ownedString(*item, ownedItem, errorCode), insertionPoint); 346 } 347 } 348 } 349 } 350 if (U_FAILURE(errorCode)) { return; } 351 352 // if the result is still too large, cut down to maxLabelCount_ elements, by removing every nth element 353 354 int32_t size = indexCharacters.size() - 1; 355 if (size > maxLabelCount_) { 356 int32_t count = 0; 357 int32_t old = -1; 358 for (int32_t i = 0; i < indexCharacters.size();) { 359 ++count; 360 int32_t bump = count * maxLabelCount_ / size; 361 if (bump == old) { 362 indexCharacters.removeElementAt(i); 363 } else { 364 old = bump; 365 ++i; 366 } 367 } 368 } 369 } 370 371 namespace { 372 373 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 374 if (!current.startsWith(BASE, BASE_LENGTH)) { 375 return current; 376 } 377 UChar rest = current.charAt(BASE_LENGTH); 378 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 379 int32_t count = rest-0x2800; 380 temp.setTo((UChar)(0x30 + count % 10)); 381 if (count >= 10) { 382 count /= 10; 383 temp.insert(0, (UChar)(0x30 + count % 10)); 384 if (count >= 10) { 385 count /= 10; 386 temp.insert(0, (UChar)(0x30 + count)); 387 } 388 } 389 return temp.append((UChar)0x5283); 390 } 391 return temp.setTo(current, BASE_LENGTH); 392 } 393 394 UBool hasMultiplePrimaryWeights( 395 const RuleBasedCollator &coll, uint32_t variableTop, 396 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { 397 ces.removeAllElements(); 398 coll.internalGetCEs(s, ces, errorCode); 399 if (U_FAILURE(errorCode)) { return FALSE; } 400 UBool seenPrimary = FALSE; 401 for (int32_t i = 0; i < ces.size(); ++i) { 402 int64_t ce = ces.elementAti(i); 403 uint32_t p = (uint32_t)(ce >> 32); 404 if (p > variableTop) { 405 // not primary ignorable 406 if (seenPrimary) { 407 return TRUE; 408 } 409 seenPrimary = TRUE; 410 } 411 } 412 return FALSE; 413 } 414 415 } // namespace 416 417 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 418 // Initialize indexCharacters. 419 UVector indexCharacters(errorCode); 420 indexCharacters.setDeleter(uprv_deleteUObject); 421 initLabels(indexCharacters, errorCode); 422 if (U_FAILURE(errorCode)) { return NULL; } 423 424 // Variables for hasMultiplePrimaryWeights(). 425 UVector64 ces(errorCode); 426 uint32_t variableTop; 427 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 428 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); 429 } else { 430 variableTop = 0; 431 } 432 UBool hasInvisibleBuckets = FALSE; 433 434 // Helper arrays for Chinese Pinyin collation. 435 Bucket *asciiBuckets[26] = { 436 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 437 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 438 }; 439 Bucket *pinyinBuckets[26] = { 440 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 441 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 442 }; 443 UBool hasPinyin = FALSE; 444 445 LocalPointer<UVector> bucketList(new UVector(errorCode), errorCode); 446 if (U_FAILURE(errorCode)) { 447 return NULL; 448 } 449 bucketList->setDeleter(uprv_deleteUObject); 450 451 // underflow bucket 452 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 453 if (bucket == NULL) { 454 errorCode = U_MEMORY_ALLOCATION_ERROR; 455 return NULL; 456 } 457 bucketList->addElement(bucket, errorCode); 458 if (U_FAILURE(errorCode)) { return NULL; } 459 460 UnicodeString temp; 461 462 // fix up the list, adding underflow, additions, overflow 463 // Insert inflow labels as needed. 464 int32_t scriptIndex = -1; 465 const UnicodeString *scriptUpperBoundary = &emptyString_; 466 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 467 UnicodeString ¤t = *getString(indexCharacters, i); 468 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 469 // We crossed the script boundary into a new script. 470 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 471 UBool skippedScript = FALSE; 472 for (;;) { 473 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 474 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 475 break; 476 } 477 skippedScript = TRUE; 478 } 479 if (skippedScript && bucketList->size() > 1) { 480 // We are skipping one or more scripts, 481 // and we are not just getting out of the underflow label. 482 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 483 if (bucket == NULL) { 484 errorCode = U_MEMORY_ALLOCATION_ERROR; 485 return NULL; 486 } 487 bucketList->addElement(bucket, errorCode); 488 } 489 } 490 // Add a bucket with the current label. 491 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 492 if (bucket == NULL) { 493 errorCode = U_MEMORY_ALLOCATION_ERROR; 494 return NULL; 495 } 496 bucketList->addElement(bucket, errorCode); 497 // Remember ASCII and Pinyin buckets for Pinyin redirects. 498 UChar c; 499 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 500 asciiBuckets[c - 0x41] = bucket; 501 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 502 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 503 pinyinBuckets[c - 0x41] = bucket; 504 hasPinyin = TRUE; 505 } 506 // Check for multiple primary weights. 507 if (!current.startsWith(BASE, BASE_LENGTH) && 508 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, 509 ces, errorCode) && 510 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 511 // "AE-ligature" or "Sch" etc. 512 for (int32_t i = bucketList->size() - 2;; --i) { 513 Bucket *singleBucket = getBucket(*bucketList, i); 514 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 515 // There is no single-character bucket since the last 516 // underflow or inflow label. 517 break; 518 } 519 if (singleBucket->displayBucket_ == NULL && 520 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, 521 singleBucket->lowerBoundary_, 522 ces, errorCode)) { 523 // Add an invisible bucket that redirects strings greater than the expansion 524 // to the previous single-character bucket. 525 // For example, after ... Q R S Sch we add Sch\uFFFF->S 526 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 527 bucket = new Bucket(emptyString_, 528 UnicodeString(current).append((UChar)0xFFFF), 529 U_ALPHAINDEX_NORMAL); 530 if (bucket == NULL) { 531 errorCode = U_MEMORY_ALLOCATION_ERROR; 532 return NULL; 533 } 534 bucket->displayBucket_ = singleBucket; 535 bucketList->addElement(bucket, errorCode); 536 hasInvisibleBuckets = TRUE; 537 break; 538 } 539 } 540 } 541 } 542 if (U_FAILURE(errorCode)) { return NULL; } 543 if (bucketList->size() == 1) { 544 // No real labels, show only the underflow label. 545 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 546 if (bl == NULL) { 547 errorCode = U_MEMORY_ALLOCATION_ERROR; 548 return NULL; 549 } 550 bucketList.orphan(); 551 return bl; 552 } 553 // overflow bucket 554 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 555 if (bucket == NULL) { 556 errorCode = U_MEMORY_ALLOCATION_ERROR; 557 return NULL; 558 } 559 bucketList->addElement(bucket, errorCode); // final 560 561 if (hasPinyin) { 562 // Redirect Pinyin buckets. 563 Bucket *asciiBucket = NULL; 564 for (int32_t i = 0; i < 26; ++i) { 565 if (asciiBuckets[i] != NULL) { 566 asciiBucket = asciiBuckets[i]; 567 } 568 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 569 pinyinBuckets[i]->displayBucket_ = asciiBucket; 570 hasInvisibleBuckets = TRUE; 571 } 572 } 573 } 574 575 if (U_FAILURE(errorCode)) { return NULL; } 576 if (!hasInvisibleBuckets) { 577 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 578 if (bl == NULL) { 579 errorCode = U_MEMORY_ALLOCATION_ERROR; 580 return NULL; 581 } 582 bucketList.orphan(); 583 return bl; 584 } 585 // Merge inflow buckets that are visually adjacent. 586 // Iterate backwards: Merge inflow into overflow rather than the other way around. 587 int32_t i = bucketList->size() - 1; 588 Bucket *nextBucket = getBucket(*bucketList, i); 589 while (--i > 0) { 590 bucket = getBucket(*bucketList, i); 591 if (bucket->displayBucket_ != NULL) { 592 continue; // skip invisible buckets 593 } 594 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 595 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 596 bucket->displayBucket_ = nextBucket; 597 continue; 598 } 599 } 600 nextBucket = bucket; 601 } 602 603 LocalPointer<UVector> publicBucketList(new UVector(errorCode), errorCode); 604 if (U_FAILURE(errorCode)) { 605 return NULL; 606 } 607 // Do not call publicBucketList->setDeleter(): 608 // This vector shares its objects with the bucketList. 609 for (int32_t i = 0; i < bucketList->size(); ++i) { 610 bucket = getBucket(*bucketList, i); 611 if (bucket->displayBucket_ == NULL) { 612 publicBucketList->addElement(bucket, errorCode); 613 } 614 } 615 if (U_FAILURE(errorCode)) { return NULL; } 616 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 617 if (bl == NULL) { 618 errorCode = U_MEMORY_ALLOCATION_ERROR; 619 return NULL; 620 } 621 bucketList.orphan(); 622 publicBucketList.orphan(); 623 return bl; 624 } 625 626 /** 627 * Creates an index, and buckets and sorts the list of records into the index. 628 */ 629 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 630 if (U_FAILURE(errorCode) || buckets_ != NULL) { 631 return; 632 } 633 buckets_ = createBucketList(errorCode); 634 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 635 return; 636 } 637 638 // Sort the records by name. 639 // Stable sort preserves input order of collation duplicates. 640 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 641 642 // Now, we traverse all of the input, which is now sorted. 643 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 644 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 645 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 646 // we need to improve it for that case. 647 648 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 649 int32_t bucketIndex = 1; 650 Bucket *nextBucket; 651 const UnicodeString *upperBoundary; 652 if (bucketIndex < buckets_->bucketList_->size()) { 653 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 654 upperBoundary = &nextBucket->lowerBoundary_; 655 } else { 656 nextBucket = NULL; 657 upperBoundary = NULL; 658 } 659 for (int32_t i = 0; i < inputList_->size(); ++i) { 660 Record *r = getRecord(*inputList_, i); 661 // if the current bucket isn't the right one, find the one that is 662 // We have a special flag for the last bucket so that we don't look any further 663 while (upperBoundary != NULL && 664 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 665 currentBucket = nextBucket; 666 // now reset the boundary that we compare against 667 if (bucketIndex < buckets_->bucketList_->size()) { 668 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 669 upperBoundary = &nextBucket->lowerBoundary_; 670 } else { 671 upperBoundary = NULL; 672 } 673 } 674 // now put the record into the bucket. 675 Bucket *bucket = currentBucket; 676 if (bucket->displayBucket_ != NULL) { 677 bucket = bucket->displayBucket_; 678 } 679 if (bucket->records_ == NULL) { 680 bucket->records_ = new UVector(errorCode); 681 if (bucket->records_ == NULL) { 682 errorCode = U_MEMORY_ALLOCATION_ERROR; 683 return; 684 } 685 } 686 bucket->records_->addElement(r, errorCode); 687 } 688 } 689 690 void AlphabeticIndex::clearBuckets() { 691 if (buckets_ != NULL) { 692 delete buckets_; 693 buckets_ = NULL; 694 internalResetBucketIterator(); 695 } 696 } 697 698 void AlphabeticIndex::internalResetBucketIterator() { 699 labelsIterIndex_ = -1; 700 currentBucket_ = NULL; 701 } 702 703 704 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 705 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 706 if (U_FAILURE(status)) { 707 return; 708 } 709 710 UnicodeSet exemplars; 711 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 712 if (U_SUCCESS(status)) { 713 initialLabels_->addAll(exemplars); 714 return; 715 } 716 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 717 718 // The locale data did not include explicit Index characters. 719 // Synthesize a set of them from the locale's standard exemplar characters. 720 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 721 if (U_FAILURE(status)) { 722 return; 723 } 724 725 // question: should we add auxiliary exemplars? 726 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 727 exemplars.add(0x61, 0x7A); 728 } 729 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 730 // cut down to small list 731 exemplars.remove(0xAC00, 0xD7A3). 732 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 733 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 734 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 735 add(0xD30C).add(0xD558); 736 } 737 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 738 // cut down to small list 739 // make use of the fact that Ethiopic is allocated in 8's, where 740 // the base is 0 mod 8. 741 UnicodeSet ethiopic( 742 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 743 UnicodeSetIterator it(ethiopic); 744 while (it.next() && !it.isString()) { 745 if ((it.getCodepoint() & 0x7) != 0) { 746 exemplars.remove(it.getCodepoint()); 747 } 748 } 749 } 750 751 // Upper-case any that aren't already so. 752 // (We only do this for synthesized index characters.) 753 UnicodeSetIterator it(exemplars); 754 UnicodeString upperC; 755 while (it.next()) { 756 const UnicodeString &exemplarC = it.getString(); 757 upperC = exemplarC; 758 upperC.toUpper(locale); 759 initialLabels_->add(upperC); 760 } 761 } 762 763 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 764 UnicodeSet contractions; 765 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); 766 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } 767 initialLabels_->addAll(contractions); 768 UnicodeSetIterator iter(contractions); 769 while (iter.next()) { 770 const UnicodeString &s = iter.getString(); 771 U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); 772 UChar c = s.charAt(s.length() - 1); 773 if (0x41 <= c && c <= 0x5A) { // A-Z 774 // There are Pinyin labels, add ASCII A-Z labels as well. 775 initialLabels_->add(0x41, 0x5A); // A-Z 776 break; 777 } 778 } 779 return TRUE; 780 } 781 782 783 /* 784 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 785 */ 786 static const UChar CGJ = 0x034F; 787 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 788 UnicodeString result; 789 if (item.length() == 0) { 790 return result; 791 } 792 int32_t i = 0; 793 for (;;) { 794 UChar32 cp = item.char32At(i); 795 result.append(cp); 796 i = item.moveIndex32(i, 1); 797 if (i >= item.length()) { 798 break; 799 } 800 result.append(CGJ); 801 } 802 return result; 803 } 804 805 806 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 807 return FALSE; 808 } 809 810 811 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 812 return FALSE; 813 } 814 815 816 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 817 return *collator_; 818 } 819 820 821 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 822 return inflowLabel_; 823 } 824 825 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 826 return overflowLabel_; 827 } 828 829 830 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 831 return underflowLabel_; 832 } 833 834 835 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 836 inflowLabel_ = label; 837 clearBuckets(); 838 return *this; 839 } 840 841 842 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 843 overflowLabel_ = label; 844 clearBuckets(); 845 return *this; 846 } 847 848 849 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 850 underflowLabel_ = label; 851 clearBuckets(); 852 return *this; 853 } 854 855 856 int32_t AlphabeticIndex::getMaxLabelCount() const { 857 return maxLabelCount_; 858 } 859 860 861 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 862 if (U_FAILURE(status)) { 863 return *this; 864 } 865 if (maxLabelCount <= 0) { 866 status = U_ILLEGAL_ARGUMENT_ERROR; 867 return *this; 868 } 869 maxLabelCount_ = maxLabelCount; 870 clearBuckets(); 871 return *this; 872 } 873 874 875 // 876 // init() - Common code for constructors. 877 // 878 879 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 880 if (U_FAILURE(status)) { return; } 881 if (locale == NULL && collator_ == NULL) { 882 status = U_ILLEGAL_ARGUMENT_ERROR; 883 return; 884 } 885 886 initialLabels_ = new UnicodeSet(); 887 if (initialLabels_ == NULL) { 888 status = U_MEMORY_ALLOCATION_ERROR; 889 return; 890 } 891 892 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 893 overflowLabel_ = inflowLabel_; 894 underflowLabel_ = inflowLabel_; 895 896 if (collator_ == NULL) { 897 Collator *coll = Collator::createInstance(*locale, status); 898 if (U_FAILURE(status)) { 899 delete coll; 900 return; 901 } 902 if (coll == NULL) { 903 status = U_MEMORY_ALLOCATION_ERROR; 904 return; 905 } 906 collator_ = dynamic_cast<RuleBasedCollator *>(coll); 907 if (collator_ == NULL) { 908 delete coll; 909 status = U_UNSUPPORTED_ERROR; 910 return; 911 } 912 } 913 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 914 if (collatorPrimaryOnly_ == NULL) { 915 status = U_MEMORY_ALLOCATION_ERROR; 916 return; 917 } 918 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 919 firstCharsInScripts_ = firstStringsInScript(status); 920 if (U_FAILURE(status)) { return; } 921 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 922 // Guard against a degenerate collator where 923 // some script boundary strings are primary ignorable. 924 for (;;) { 925 if (U_FAILURE(status)) { return; } 926 if (firstCharsInScripts_->isEmpty()) { 927 // AlphabeticIndex requires some non-ignorable script boundary strings. 928 status = U_ILLEGAL_ARGUMENT_ERROR; 929 return; 930 } 931 if (collatorPrimaryOnly_->compare( 932 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 933 emptyString_, status) == UCOL_EQUAL) { 934 firstCharsInScripts_->removeElementAt(0); 935 } else { 936 break; 937 } 938 } 939 940 // Chinese index characters, which are specific to each of the several Chinese tailorings, 941 // take precedence over the single locale data exemplar set per language. 942 if (!addChineseIndexCharacters(status) && locale != NULL) { 943 addIndexExemplars(*locale, status); 944 } 945 } 946 947 948 // 949 // Comparison function for UVector<UnicodeString *> sorting with a collator. 950 // 951 static int32_t U_CALLCONV 952 collatorComparator(const void *context, const void *left, const void *right) { 953 const UElement *leftElement = static_cast<const UElement *>(left); 954 const UElement *rightElement = static_cast<const UElement *>(right); 955 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 956 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 957 958 if (leftString == rightString) { 959 // Catches case where both are NULL 960 return 0; 961 } 962 if (leftString == NULL) { 963 return 1; 964 }; 965 if (rightString == NULL) { 966 return -1; 967 } 968 const Collator *col = static_cast<const Collator *>(context); 969 UErrorCode errorCode = U_ZERO_ERROR; 970 return col->compare(*leftString, *rightString, errorCode); 971 } 972 973 // 974 // Comparison function for UVector<Record *> sorting with a collator. 975 // 976 static int32_t U_CALLCONV 977 recordCompareFn(const void *context, const void *left, const void *right) { 978 const UElement *leftElement = static_cast<const UElement *>(left); 979 const UElement *rightElement = static_cast<const UElement *>(right); 980 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 981 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 982 const Collator *col = static_cast<const Collator *>(context); 983 UErrorCode errorCode = U_ZERO_ERROR; 984 return col->compare(leftRec->name_, rightRec->name_, errorCode); 985 } 986 987 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 988 if (U_FAILURE(status)) { 989 return NULL; 990 } 991 LocalPointer<UVector> dest(new UVector(status), status); 992 if (U_FAILURE(status)) { 993 return NULL; 994 } 995 dest->setDeleter(uprv_deleteUObject); 996 // Fetch the script-first-primary contractions which are defined in the root collator. 997 // They all start with U+FDD1. 998 UnicodeSet set; 999 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); 1000 if (U_FAILURE(status)) { 1001 return NULL; 1002 } 1003 if (set.isEmpty()) { 1004 status = U_UNSUPPORTED_ERROR; 1005 return NULL; 1006 } 1007 UnicodeSetIterator iter(set); 1008 while (iter.next()) { 1009 const UnicodeString &boundary = iter.getString(); 1010 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); 1011 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { 1012 // Ignore boundaries for the special reordering groups. 1013 // Take only those for "real scripts" (where the sample character is a Letter, 1014 // and the one for unassigned implicit weights (Cn). 1015 continue; 1016 } 1017 UnicodeString *s = new UnicodeString(boundary); 1018 if (s == NULL) { 1019 status = U_MEMORY_ALLOCATION_ERROR; 1020 return NULL; 1021 } 1022 dest->addElement(s, status); 1023 } 1024 return dest.orphan(); 1025 } 1026 1027 1028 namespace { 1029 1030 /** 1031 * Returns true if one index character string is "better" than the other. 1032 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1033 * better, and otherwise binary-less-than is better. 1034 */ 1035 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1036 const UnicodeString &one, const UnicodeString &other) { 1037 // This is called with primary-equal strings, but never with one.equals(other). 1038 UErrorCode status = U_ZERO_ERROR; 1039 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1040 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1041 if (U_FAILURE(status)) { return FALSE; } 1042 int32_t result = n1.countChar32() - n2.countChar32(); 1043 if (result != 0) { 1044 return result < 0; 1045 } 1046 result = n1.compareCodePointOrder(n2); 1047 if (result != 0) { 1048 return result < 0; 1049 } 1050 return one.compareCodePointOrder(other) < 0; 1051 } 1052 1053 } // namespace 1054 1055 // 1056 // Constructor & Destructor for AlphabeticIndex::Record 1057 // 1058 // Records are internal only, instances are not directly surfaced in the public API. 1059 // This class is mostly struct-like, with all public fields. 1060 1061 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1062 : name_(name), data_(data) {} 1063 1064 AlphabeticIndex::Record::~Record() { 1065 } 1066 1067 1068 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1069 if (U_FAILURE(status)) { 1070 return *this; 1071 } 1072 if (inputList_ == NULL) { 1073 inputList_ = new UVector(status); 1074 if (inputList_ == NULL) { 1075 status = U_MEMORY_ALLOCATION_ERROR; 1076 return *this; 1077 } 1078 inputList_->setDeleter(alphaIndex_deleteRecord); 1079 } 1080 Record *r = new Record(name, data); 1081 if (r == NULL) { 1082 status = U_MEMORY_ALLOCATION_ERROR; 1083 return *this; 1084 } 1085 inputList_->addElement(r, status); 1086 clearBuckets(); 1087 //std::string ss; 1088 //std::string ss2; 1089 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1090 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1091 return *this; 1092 } 1093 1094 1095 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1096 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1097 inputList_->removeAllElements(); 1098 clearBuckets(); 1099 } 1100 return *this; 1101 } 1102 1103 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1104 initBuckets(status); 1105 if (U_FAILURE(status)) { 1106 return 0; 1107 } 1108 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1109 } 1110 1111 1112 int32_t AlphabeticIndex::getBucketIndex() const { 1113 return labelsIterIndex_; 1114 } 1115 1116 1117 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1118 if (U_FAILURE(status)) { 1119 return FALSE; 1120 } 1121 if (buckets_ == NULL && currentBucket_ != NULL) { 1122 status = U_ENUM_OUT_OF_SYNC_ERROR; 1123 return FALSE; 1124 } 1125 initBuckets(status); 1126 if (U_FAILURE(status)) { 1127 return FALSE; 1128 } 1129 ++labelsIterIndex_; 1130 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1131 labelsIterIndex_ = buckets_->getBucketCount(); 1132 return FALSE; 1133 } 1134 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1135 resetRecordIterator(); 1136 return TRUE; 1137 } 1138 1139 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1140 if (currentBucket_ != NULL) { 1141 return currentBucket_->label_; 1142 } else { 1143 return emptyString_; 1144 } 1145 } 1146 1147 1148 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1149 if (currentBucket_ != NULL) { 1150 return currentBucket_->labelType_; 1151 } else { 1152 return U_ALPHAINDEX_NORMAL; 1153 } 1154 } 1155 1156 1157 int32_t AlphabeticIndex::getBucketRecordCount() const { 1158 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1159 return currentBucket_->records_->size(); 1160 } else { 1161 return 0; 1162 } 1163 } 1164 1165 1166 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1167 if (U_FAILURE(status)) { 1168 return *this; 1169 } 1170 internalResetBucketIterator(); 1171 return *this; 1172 } 1173 1174 1175 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1176 if (U_FAILURE(status)) { 1177 return FALSE; 1178 } 1179 if (currentBucket_ == NULL) { 1180 // We are trying to iterate over the items in a bucket, but there is no 1181 // current bucket from the enumeration of buckets. 1182 status = U_INVALID_STATE_ERROR; 1183 return FALSE; 1184 } 1185 if (buckets_ == NULL) { 1186 status = U_ENUM_OUT_OF_SYNC_ERROR; 1187 return FALSE; 1188 } 1189 if (currentBucket_->records_ == NULL) { 1190 return FALSE; 1191 } 1192 ++itemsIterIndex_; 1193 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1194 itemsIterIndex_ = currentBucket_->records_->size(); 1195 return FALSE; 1196 } 1197 return TRUE; 1198 } 1199 1200 1201 const UnicodeString &AlphabeticIndex::getRecordName() const { 1202 const UnicodeString *retStr = &emptyString_; 1203 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1204 itemsIterIndex_ >= 0 && 1205 itemsIterIndex_ < currentBucket_->records_->size()) { 1206 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1207 retStr = &item->name_; 1208 } 1209 return *retStr; 1210 } 1211 1212 const void *AlphabeticIndex::getRecordData() const { 1213 const void *retPtr = NULL; 1214 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1215 itemsIterIndex_ >= 0 && 1216 itemsIterIndex_ < currentBucket_->records_->size()) { 1217 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1218 retPtr = item->data_; 1219 } 1220 return retPtr; 1221 } 1222 1223 1224 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1225 itemsIterIndex_ = -1; 1226 return *this; 1227 } 1228 1229 1230 1231 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1232 const UnicodeString &lowerBoundary, 1233 UAlphabeticIndexLabelType type) 1234 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1235 displayBucket_(NULL), displayIndex_(-1), 1236 records_(NULL) { 1237 } 1238 1239 1240 AlphabeticIndex::Bucket::~Bucket() { 1241 delete records_; 1242 } 1243 1244 U_NAMESPACE_END 1245 1246 #endif // !UCONFIG_NO_COLLATION 1247