1 /* 2 ******************************************************************************* 3 * Copyright (C) 2009-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION 11 12 #include "unicode/alphaindex.h" 13 #include "unicode/coll.h" 14 #include "unicode/localpointer.h" 15 #include "unicode/normalizer2.h" 16 #include "unicode/tblcoll.h" 17 #include "unicode/uchar.h" 18 #include "unicode/ulocdata.h" 19 #include "unicode/uniset.h" 20 #include "unicode/uobject.h" 21 #include "unicode/usetiter.h" 22 #include "unicode/utf16.h" 23 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "uassert.h" 27 #include "uvector.h" 28 #include "uvectr64.h" 29 30 //#include <string> 31 //#include <iostream> 32 33 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 34 35 U_NAMESPACE_BEGIN 36 37 namespace { 38 39 /** 40 * Prefix string for Chinese index buckets. 41 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 42 */ 43 const UChar BASE[1] = { 0xFDD0 }; 44 const int32_t BASE_LENGTH = 1; 45 46 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 47 const UnicodeString &one, const UnicodeString &other); 48 49 } // namespace 50 51 static int32_t U_CALLCONV 52 collatorComparator(const void *context, const void *left, const void *right); 53 54 static int32_t U_CALLCONV 55 recordCompareFn(const void *context, const void *left, const void *right); 56 57 // UVector<Record *> support function, delete a Record. 58 static void U_CALLCONV 59 alphaIndex_deleteRecord(void *obj) { 60 delete static_cast<AlphabeticIndex::Record *>(obj); 61 } 62 63 namespace { 64 65 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 66 UErrorCode &errorCode) { 67 if (U_FAILURE(errorCode)) { return NULL; } 68 if (owned.isValid()) { 69 return owned.orphan(); 70 } 71 UnicodeString *p = new UnicodeString(s); 72 if (p == NULL) { 73 errorCode = U_MEMORY_ALLOCATION_ERROR; 74 } 75 return p; 76 } 77 78 inline UnicodeString *getString(const UVector &list, int32_t i) { 79 return static_cast<UnicodeString *>(list[i]); 80 } 81 82 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 83 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 84 } 85 86 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 87 return static_cast<AlphabeticIndex::Record *>(list[i]); 88 } 89 90 /** 91 * Like Java Collections.binarySearch(List, String, Comparator). 92 * 93 * @return the index>=0 where the item was found, 94 * or the index<0 for inserting the string at ~index in sorted order 95 */ 96 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 97 if (list.size() == 0) { return ~0; } 98 int32_t start = 0; 99 int32_t limit = list.size(); 100 for (;;) { 101 int32_t i = (start + limit) / 2; 102 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 103 UErrorCode errorCode = U_ZERO_ERROR; 104 UCollationResult cmp = coll.compare(s, *si, errorCode); 105 if (cmp == UCOL_EQUAL) { 106 return i; 107 } else if (cmp < 0) { 108 if (i == start) { 109 return ~start; // insert s before *si 110 } 111 limit = i; 112 } else { 113 if (i == start) { 114 return ~(start + 1); // insert s after *si 115 } 116 start = i; 117 } 118 } 119 } 120 121 } // namespace 122 123 // The BucketList is not in the anonymous namespace because only Clang 124 // seems to support its use in other classes from there. 125 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 126 class BucketList : public UObject { 127 public: 128 BucketList(UVector *bucketList, UVector *publicBucketList) 129 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 130 int32_t displayIndex = 0; 131 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 132 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 133 } 134 } 135 136 // The virtual destructor must not be inline. 137 // See ticket #8454 for details. 138 virtual ~BucketList(); 139 140 int32_t getBucketCount() const { 141 return immutableVisibleList_->size(); 142 } 143 144 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 145 UErrorCode &errorCode) { 146 // binary search 147 int32_t start = 0; 148 int32_t limit = bucketList_->size(); 149 while ((start + 1) < limit) { 150 int32_t i = (start + limit) / 2; 151 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 152 UCollationResult nameVsBucket = 153 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 154 if (nameVsBucket < 0) { 155 limit = i; 156 } else { 157 start = i; 158 } 159 } 160 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 161 if (bucket->displayBucket_ != NULL) { 162 bucket = bucket->displayBucket_; 163 } 164 return bucket->displayIndex_; 165 } 166 167 /** All of the buckets, visible and invisible. */ 168 UVector *bucketList_; 169 /** Just the visible buckets. */ 170 UVector *immutableVisibleList_; 171 }; 172 173 BucketList::~BucketList() { 174 delete bucketList_; 175 if (immutableVisibleList_ != bucketList_) { 176 delete immutableVisibleList_; 177 } 178 } 179 180 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 181 delete buckets_; 182 delete collatorPrimaryOnly_; 183 } 184 185 int32_t 186 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 187 return buckets_->getBucketCount(); 188 } 189 190 int32_t 191 AlphabeticIndex::ImmutableIndex::getBucketIndex( 192 const UnicodeString &name, UErrorCode &errorCode) const { 193 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 194 } 195 196 const AlphabeticIndex::Bucket * 197 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 198 if (0 <= index && index < buckets_->getBucketCount()) { 199 return icu::getBucket(*buckets_->immutableVisibleList_, index); 200 } else { 201 return NULL; 202 } 203 } 204 205 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 206 : inputList_(NULL), 207 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 208 maxLabelCount_(99), 209 initialLabels_(NULL), firstCharsInScripts_(NULL), 210 collator_(NULL), collatorPrimaryOnly_(NULL), 211 buckets_(NULL) { 212 init(&locale, status); 213 } 214 215 216 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 217 : inputList_(NULL), 218 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 219 maxLabelCount_(99), 220 initialLabels_(NULL), firstCharsInScripts_(NULL), 221 collator_(collator), collatorPrimaryOnly_(NULL), 222 buckets_(NULL) { 223 init(NULL, status); 224 } 225 226 227 228 AlphabeticIndex::~AlphabeticIndex() { 229 delete collator_; 230 delete collatorPrimaryOnly_; 231 delete firstCharsInScripts_; 232 delete buckets_; 233 delete inputList_; 234 delete initialLabels_; 235 } 236 237 238 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 239 if (U_FAILURE(status)) { 240 return *this; 241 } 242 initialLabels_->addAll(additions); 243 clearBuckets(); 244 return *this; 245 } 246 247 248 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 249 addIndexExemplars(locale, status); 250 clearBuckets(); 251 return *this; 252 } 253 254 255 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 256 if (U_FAILURE(errorCode)) { return NULL; } 257 // In C++, the ImmutableIndex must own its copy of the BucketList, 258 // even if it contains no records, for proper memory management. 259 // We could clone the buckets_ if they are not NULL, 260 // but that would be worth it only if this method is called multiple times, 261 // or called after using the old-style bucket iterator API. 262 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 263 LocalPointer<RuleBasedCollator> coll( 264 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 265 if (immutableBucketList.isNull() || coll.isNull()) { 266 errorCode = U_MEMORY_ALLOCATION_ERROR; 267 return NULL; 268 } 269 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 270 if (immIndex == NULL) { 271 errorCode = U_MEMORY_ALLOCATION_ERROR; 272 return NULL; 273 } 274 // The ImmutableIndex adopted its parameter objects. 275 immutableBucketList.orphan(); 276 coll.orphan(); 277 return immIndex; 278 } 279 280 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 281 initBuckets(status); 282 if (U_FAILURE(status)) { 283 return 0; 284 } 285 return buckets_->getBucketCount(); 286 } 287 288 289 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 290 if (U_FAILURE(status) || inputList_ == NULL) { 291 return 0; 292 } 293 return inputList_->size(); 294 } 295 296 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 297 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 298 if (U_FAILURE(errorCode)) { return; } 299 300 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 301 const UnicodeString &overflowBoundary = 302 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 303 304 // We make a sorted array of elements. 305 // Some of the input may be redundant. 306 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 307 // We filter out those cases. 308 UnicodeSetIterator iter(*initialLabels_); 309 while (iter.next()) { 310 const UnicodeString *item = &iter.getString(); 311 LocalPointer<UnicodeString> ownedItem; 312 UBool checkDistinct; 313 int32_t itemLength = item->length(); 314 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 315 checkDistinct = FALSE; 316 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 317 item->charAt(itemLength - 2) != 0x2a) { 318 // Use a label if it is marked with one trailing star, 319 // even if the label string sorts the same when all contractions are suppressed. 320 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 321 item = ownedItem.getAlias(); 322 if (item == NULL) { 323 errorCode = U_MEMORY_ALLOCATION_ERROR; 324 return; 325 } 326 checkDistinct = FALSE; 327 } else { 328 checkDistinct = TRUE; 329 } 330 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 331 // Ignore a primary-ignorable or non-alphabetic index character. 332 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 333 // Ignore an index character that will land in the overflow bucket. 334 } else if (checkDistinct && 335 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 336 // Ignore a multi-code point index character that does not sort distinctly 337 // from the sequence of its separate characters. 338 } else { 339 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 340 if (insertionPoint < 0) { 341 indexCharacters.insertElementAt( 342 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 343 } else { 344 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 345 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 346 indexCharacters.setElementAt( 347 ownedString(*item, ownedItem, errorCode), insertionPoint); 348 } 349 } 350 } 351 } 352 if (U_FAILURE(errorCode)) { return; } 353 354 // if the result is still too large, cut down to maxCount elements, by removing every nth element 355 356 int32_t size = indexCharacters.size() - 1; 357 if (size > maxLabelCount_) { 358 int32_t count = 0; 359 int32_t old = -1; 360 for (int32_t i = 0; i < indexCharacters.size();) { 361 ++count; 362 int32_t bump = count * maxLabelCount_ / size; 363 if (bump == old) { 364 indexCharacters.removeElementAt(i); 365 } else { 366 old = bump; 367 ++i; 368 } 369 } 370 } 371 } 372 373 namespace { 374 375 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 376 if (!current.startsWith(BASE, BASE_LENGTH)) { 377 return current; 378 } 379 UChar rest = current.charAt(BASE_LENGTH); 380 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 381 int32_t count = rest-0x2800; 382 temp.setTo((UChar)(0x30 + count % 10)); 383 if (count >= 10) { 384 count /= 10; 385 temp.insert(0, (UChar)(0x30 + count % 10)); 386 if (count >= 10) { 387 count /= 10; 388 temp.insert(0, (UChar)(0x30 + count)); 389 } 390 } 391 return temp.append((UChar)0x5283); 392 } 393 return temp.setTo(current, BASE_LENGTH); 394 } 395 396 UBool hasMultiplePrimaryWeights( 397 const RuleBasedCollator &coll, uint32_t variableTop, 398 const UnicodeString &s, UVector64 &ces, UErrorCode &errorCode) { 399 ces.removeAllElements(); 400 coll.internalGetCEs(s, ces, errorCode); 401 if (U_FAILURE(errorCode)) { return FALSE; } 402 UBool seenPrimary = FALSE; 403 for (int32_t i = 0; i < ces.size(); ++i) { 404 int64_t ce = ces.elementAti(i); 405 uint32_t p = (uint32_t)(ce >> 32); 406 if (p > variableTop) { 407 // not primary ignorable 408 if (seenPrimary) { 409 return TRUE; 410 } 411 seenPrimary = TRUE; 412 } 413 } 414 return FALSE; 415 } 416 417 } // namespace 418 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 420 // Initialize indexCharacters. 421 UVector indexCharacters(errorCode); 422 indexCharacters.setDeleter(uprv_deleteUObject); 423 initLabels(indexCharacters, errorCode); 424 if (U_FAILURE(errorCode)) { return NULL; } 425 426 // Variables for hasMultiplePrimaryWeights(). 427 UVector64 ces(errorCode); 428 uint32_t variableTop; 429 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 430 variableTop = collatorPrimaryOnly_->getVariableTop(errorCode); 431 } else { 432 variableTop = 0; 433 } 434 UBool hasInvisibleBuckets = FALSE; 435 436 // Helper arrays for Chinese Pinyin collation. 437 Bucket *asciiBuckets[26] = { 438 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 439 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 440 }; 441 Bucket *pinyinBuckets[26] = { 442 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 443 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 444 }; 445 UBool hasPinyin = FALSE; 446 447 LocalPointer<UVector> bucketList(new UVector(errorCode)); 448 if (bucketList.isNull()) { 449 errorCode = U_MEMORY_ALLOCATION_ERROR; 450 return NULL; 451 } 452 bucketList->setDeleter(uprv_deleteUObject); 453 454 // underflow bucket 455 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 456 if (bucket == NULL) { 457 errorCode = U_MEMORY_ALLOCATION_ERROR; 458 return NULL; 459 } 460 bucketList->addElement(bucket, errorCode); 461 if (U_FAILURE(errorCode)) { return NULL; } 462 463 UnicodeString temp; 464 465 // fix up the list, adding underflow, additions, overflow 466 // Insert inflow labels as needed. 467 int32_t scriptIndex = -1; 468 const UnicodeString *scriptUpperBoundary = &emptyString_; 469 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 470 UnicodeString ¤t = *getString(indexCharacters, i); 471 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 472 // We crossed the script boundary into a new script. 473 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 474 UBool skippedScript = FALSE; 475 for (;;) { 476 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 478 break; 479 } 480 skippedScript = TRUE; 481 } 482 if (skippedScript && bucketList->size() > 1) { 483 // We are skipping one or more scripts, 484 // and we are not just getting out of the underflow label. 485 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 486 if (bucket == NULL) { 487 errorCode = U_MEMORY_ALLOCATION_ERROR; 488 return NULL; 489 } 490 bucketList->addElement(bucket, errorCode); 491 } 492 } 493 // Add a bucket with the current label. 494 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 495 if (bucket == NULL) { 496 errorCode = U_MEMORY_ALLOCATION_ERROR; 497 return NULL; 498 } 499 bucketList->addElement(bucket, errorCode); 500 // Remember ASCII and Pinyin buckets for Pinyin redirects. 501 UChar c; 502 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 503 asciiBuckets[c - 0x41] = bucket; 504 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 505 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 506 pinyinBuckets[c - 0x41] = bucket; 507 hasPinyin = TRUE; 508 } 509 // Check for multiple primary weights. 510 if (!current.startsWith(BASE, BASE_LENGTH) && 511 hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, current, 512 ces, errorCode) && 513 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 514 // "AE-ligature" or "Sch" etc. 515 for (int32_t i = bucketList->size() - 2;; --i) { 516 Bucket *singleBucket = getBucket(*bucketList, i); 517 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 518 // There is no single-character bucket since the last 519 // underflow or inflow label. 520 break; 521 } 522 if (singleBucket->displayBucket_ == NULL && 523 !hasMultiplePrimaryWeights(*collatorPrimaryOnly_, variableTop, 524 singleBucket->lowerBoundary_, 525 ces, errorCode)) { 526 // Add an invisible bucket that redirects strings greater than the expansion 527 // to the previous single-character bucket. 528 // For example, after ... Q R S Sch we add Sch\uFFFF->S 529 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 530 bucket = new Bucket(emptyString_, 531 UnicodeString(current).append((UChar)0xFFFF), 532 U_ALPHAINDEX_NORMAL); 533 if (bucket == NULL) { 534 errorCode = U_MEMORY_ALLOCATION_ERROR; 535 return NULL; 536 } 537 bucket->displayBucket_ = singleBucket; 538 bucketList->addElement(bucket, errorCode); 539 hasInvisibleBuckets = TRUE; 540 break; 541 } 542 } 543 } 544 } 545 if (U_FAILURE(errorCode)) { return NULL; } 546 if (bucketList->size() == 1) { 547 // No real labels, show only the underflow label. 548 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 549 if (bl == NULL) { 550 errorCode = U_MEMORY_ALLOCATION_ERROR; 551 return NULL; 552 } 553 bucketList.orphan(); 554 return bl; 555 } 556 // overflow bucket 557 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 558 if (bucket == NULL) { 559 errorCode = U_MEMORY_ALLOCATION_ERROR; 560 return NULL; 561 } 562 bucketList->addElement(bucket, errorCode); // final 563 564 if (hasPinyin) { 565 // Redirect Pinyin buckets. 566 Bucket *asciiBucket = NULL; 567 for (int32_t i = 0; i < 26; ++i) { 568 if (asciiBuckets[i] != NULL) { 569 asciiBucket = asciiBuckets[i]; 570 } 571 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 572 pinyinBuckets[i]->displayBucket_ = asciiBucket; 573 hasInvisibleBuckets = TRUE; 574 } 575 } 576 } 577 578 if (U_FAILURE(errorCode)) { return NULL; } 579 if (!hasInvisibleBuckets) { 580 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 581 if (bl == NULL) { 582 errorCode = U_MEMORY_ALLOCATION_ERROR; 583 return NULL; 584 } 585 bucketList.orphan(); 586 return bl; 587 } 588 // Merge inflow buckets that are visually adjacent. 589 // Iterate backwards: Merge inflow into overflow rather than the other way around. 590 int32_t i = bucketList->size() - 1; 591 Bucket *nextBucket = getBucket(*bucketList, i); 592 while (--i > 0) { 593 bucket = getBucket(*bucketList, i); 594 if (bucket->displayBucket_ != NULL) { 595 continue; // skip invisible buckets 596 } 597 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 598 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 599 bucket->displayBucket_ = nextBucket; 600 continue; 601 } 602 } 603 nextBucket = bucket; 604 } 605 606 LocalPointer<UVector> publicBucketList(new UVector(errorCode)); 607 if (bucketList.isNull()) { 608 errorCode = U_MEMORY_ALLOCATION_ERROR; 609 return NULL; 610 } 611 // Do not call publicBucketList->setDeleter(): 612 // This vector shares its objects with the bucketList. 613 for (int32_t i = 0; i < bucketList->size(); ++i) { 614 bucket = getBucket(*bucketList, i); 615 if (bucket->displayBucket_ == NULL) { 616 publicBucketList->addElement(bucket, errorCode); 617 } 618 } 619 if (U_FAILURE(errorCode)) { return NULL; } 620 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 621 if (bl == NULL) { 622 errorCode = U_MEMORY_ALLOCATION_ERROR; 623 return NULL; 624 } 625 bucketList.orphan(); 626 publicBucketList.orphan(); 627 return bl; 628 } 629 630 /** 631 * Creates an index, and buckets and sorts the list of records into the index. 632 */ 633 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 634 if (U_FAILURE(errorCode) || buckets_ != NULL) { 635 return; 636 } 637 buckets_ = createBucketList(errorCode); 638 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 639 return; 640 } 641 642 // Sort the records by name. 643 // Stable sort preserves input order of collation duplicates. 644 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 645 646 // Now, we traverse all of the input, which is now sorted. 647 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 648 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 649 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 650 // we need to improve it for that case. 651 652 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 653 int32_t bucketIndex = 1; 654 Bucket *nextBucket; 655 const UnicodeString *upperBoundary; 656 if (bucketIndex < buckets_->bucketList_->size()) { 657 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 658 upperBoundary = &nextBucket->lowerBoundary_; 659 } else { 660 nextBucket = NULL; 661 upperBoundary = NULL; 662 } 663 for (int32_t i = 0; i < inputList_->size(); ++i) { 664 Record *r = getRecord(*inputList_, i); 665 // if the current bucket isn't the right one, find the one that is 666 // We have a special flag for the last bucket so that we don't look any further 667 while (upperBoundary != NULL && 668 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 669 currentBucket = nextBucket; 670 // now reset the boundary that we compare against 671 if (bucketIndex < buckets_->bucketList_->size()) { 672 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 673 upperBoundary = &nextBucket->lowerBoundary_; 674 } else { 675 upperBoundary = NULL; 676 } 677 } 678 // now put the record into the bucket. 679 Bucket *bucket = currentBucket; 680 if (bucket->displayBucket_ != NULL) { 681 bucket = bucket->displayBucket_; 682 } 683 if (bucket->records_ == NULL) { 684 bucket->records_ = new UVector(errorCode); 685 if (bucket->records_ == NULL) { 686 errorCode = U_MEMORY_ALLOCATION_ERROR; 687 return; 688 } 689 } 690 bucket->records_->addElement(r, errorCode); 691 } 692 } 693 694 void AlphabeticIndex::clearBuckets() { 695 if (buckets_ != NULL) { 696 delete buckets_; 697 buckets_ = NULL; 698 internalResetBucketIterator(); 699 } 700 } 701 702 void AlphabeticIndex::internalResetBucketIterator() { 703 labelsIterIndex_ = -1; 704 currentBucket_ = NULL; 705 } 706 707 708 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 709 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 710 if (U_FAILURE(status)) { 711 return; 712 } 713 714 UnicodeSet exemplars; 715 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 716 if (U_SUCCESS(status)) { 717 initialLabels_->addAll(exemplars); 718 return; 719 } 720 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 721 722 // The locale data did not include explicit Index characters. 723 // Synthesize a set of them from the locale's standard exemplar characters. 724 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 725 if (U_FAILURE(status)) { 726 return; 727 } 728 729 // question: should we add auxiliary exemplars? 730 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 731 exemplars.add(0x61, 0x7A); 732 } 733 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 734 // cut down to small list 735 exemplars.remove(0xAC00, 0xD7A3). 736 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 737 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 738 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 739 add(0xD30C).add(0xD558); 740 } 741 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 742 // cut down to small list 743 // make use of the fact that Ethiopic is allocated in 8's, where 744 // the base is 0 mod 8. 745 UnicodeSet ethiopic( 746 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 747 UnicodeSetIterator it(ethiopic); 748 while (it.next() && !it.isString()) { 749 if ((it.getCodepoint() & 0x7) != 0) { 750 exemplars.remove(it.getCodepoint()); 751 } 752 } 753 } 754 755 // Upper-case any that aren't already so. 756 // (We only do this for synthesized index characters.) 757 UnicodeSetIterator it(exemplars); 758 UnicodeString upperC; 759 while (it.next()) { 760 const UnicodeString &exemplarC = it.getString(); 761 upperC = exemplarC; 762 upperC.toUpper(locale); 763 initialLabels_->add(upperC); 764 } 765 } 766 767 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 768 UnicodeSet contractions; 769 collatorPrimaryOnly_->internalAddContractions(BASE[0], contractions, errorCode); 770 if (U_FAILURE(errorCode) || contractions.isEmpty()) { return FALSE; } 771 initialLabels_->addAll(contractions); 772 UnicodeSetIterator iter(contractions); 773 while (iter.next()) { 774 const UnicodeString &s = iter.getString(); 775 U_ASSERT (s.startsWith(BASE, BASE_LENGTH)); 776 UChar c = s.charAt(s.length() - 1); 777 if (0x41 <= c && c <= 0x5A) { // A-Z 778 // There are Pinyin labels, add ASCII A-Z labels as well. 779 initialLabels_->add(0x41, 0x5A); // A-Z 780 break; 781 } 782 } 783 return TRUE; 784 } 785 786 787 /* 788 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 789 */ 790 static const UChar CGJ = 0x034F; 791 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 792 UnicodeString result; 793 if (item.length() == 0) { 794 return result; 795 } 796 int32_t i = 0; 797 for (;;) { 798 UChar32 cp = item.char32At(i); 799 result.append(cp); 800 i = item.moveIndex32(i, 1); 801 if (i >= item.length()) { 802 break; 803 } 804 result.append(CGJ); 805 } 806 return result; 807 } 808 809 810 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 811 return FALSE; 812 } 813 814 815 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 816 return FALSE; 817 } 818 819 820 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 821 return *collator_; 822 } 823 824 825 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 826 return inflowLabel_; 827 } 828 829 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 830 return overflowLabel_; 831 } 832 833 834 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 835 return underflowLabel_; 836 } 837 838 839 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 840 inflowLabel_ = label; 841 clearBuckets(); 842 return *this; 843 } 844 845 846 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 847 overflowLabel_ = label; 848 clearBuckets(); 849 return *this; 850 } 851 852 853 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 854 underflowLabel_ = label; 855 clearBuckets(); 856 return *this; 857 } 858 859 860 int32_t AlphabeticIndex::getMaxLabelCount() const { 861 return maxLabelCount_; 862 } 863 864 865 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 866 if (U_FAILURE(status)) { 867 return *this; 868 } 869 if (maxLabelCount <= 0) { 870 status = U_ILLEGAL_ARGUMENT_ERROR; 871 return *this; 872 } 873 maxLabelCount_ = maxLabelCount; 874 clearBuckets(); 875 return *this; 876 } 877 878 879 // 880 // init() - Common code for constructors. 881 // 882 883 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 884 if (U_FAILURE(status)) { return; } 885 if (locale == NULL && collator_ == NULL) { 886 status = U_ILLEGAL_ARGUMENT_ERROR; 887 return; 888 } 889 890 initialLabels_ = new UnicodeSet(); 891 if (initialLabels_ == NULL) { 892 status = U_MEMORY_ALLOCATION_ERROR; 893 return; 894 } 895 896 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 897 overflowLabel_ = inflowLabel_; 898 underflowLabel_ = inflowLabel_; 899 900 if (collator_ == NULL) { 901 Collator *coll = Collator::createInstance(*locale, status); 902 if (U_FAILURE(status)) { 903 delete coll; 904 return; 905 } 906 if (coll == NULL) { 907 status = U_MEMORY_ALLOCATION_ERROR; 908 return; 909 } 910 collator_ = dynamic_cast<RuleBasedCollator *>(coll); 911 if (collator_ == NULL) { 912 delete coll; 913 status = U_UNSUPPORTED_ERROR; 914 return; 915 } 916 } 917 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 918 if (collatorPrimaryOnly_ == NULL) { 919 status = U_MEMORY_ALLOCATION_ERROR; 920 return; 921 } 922 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 923 firstCharsInScripts_ = firstStringsInScript(status); 924 if (U_FAILURE(status)) { return; } 925 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 926 // Guard against a degenerate collator where 927 // some script boundary strings are primary ignorable. 928 for (;;) { 929 if (U_FAILURE(status)) { return; } 930 if (firstCharsInScripts_->isEmpty()) { 931 // AlphabeticIndex requires some non-ignorable script boundary strings. 932 status = U_ILLEGAL_ARGUMENT_ERROR; 933 return; 934 } 935 if (collatorPrimaryOnly_->compare( 936 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 937 emptyString_, status) == UCOL_EQUAL) { 938 firstCharsInScripts_->removeElementAt(0); 939 } else { 940 break; 941 } 942 } 943 944 // Chinese index characters, which are specific to each of the several Chinese tailorings, 945 // take precedence over the single locale data exemplar set per language. 946 if (!addChineseIndexCharacters(status) && locale != NULL) { 947 addIndexExemplars(*locale, status); 948 } 949 } 950 951 952 // 953 // Comparison function for UVector<UnicodeString *> sorting with a collator. 954 // 955 static int32_t U_CALLCONV 956 collatorComparator(const void *context, const void *left, const void *right) { 957 const UElement *leftElement = static_cast<const UElement *>(left); 958 const UElement *rightElement = static_cast<const UElement *>(right); 959 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 960 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 961 962 if (leftString == rightString) { 963 // Catches case where both are NULL 964 return 0; 965 } 966 if (leftString == NULL) { 967 return 1; 968 }; 969 if (rightString == NULL) { 970 return -1; 971 } 972 const Collator *col = static_cast<const Collator *>(context); 973 UErrorCode errorCode = U_ZERO_ERROR; 974 return col->compare(*leftString, *rightString, errorCode); 975 } 976 977 // 978 // Comparison function for UVector<Record *> sorting with a collator. 979 // 980 static int32_t U_CALLCONV 981 recordCompareFn(const void *context, const void *left, const void *right) { 982 const UElement *leftElement = static_cast<const UElement *>(left); 983 const UElement *rightElement = static_cast<const UElement *>(right); 984 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 985 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 986 const Collator *col = static_cast<const Collator *>(context); 987 UErrorCode errorCode = U_ZERO_ERROR; 988 return col->compare(leftRec->name_, rightRec->name_, errorCode); 989 } 990 991 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 992 if (U_FAILURE(status)) { 993 return NULL; 994 } 995 LocalPointer<UVector> dest(new UVector(status)); 996 if (dest.isNull()) { 997 status = U_MEMORY_ALLOCATION_ERROR; 998 return NULL; 999 } 1000 dest->setDeleter(uprv_deleteUObject); 1001 // Fetch the script-first-primary contractions which are defined in the root collator. 1002 // They all start with U+FDD1. 1003 UnicodeSet set; 1004 collatorPrimaryOnly_->internalAddContractions(0xFDD1, set, status); 1005 if (U_FAILURE(status)) { 1006 return NULL; 1007 } 1008 if (set.isEmpty()) { 1009 status = U_UNSUPPORTED_ERROR; 1010 return NULL; 1011 } 1012 UnicodeSetIterator iter(set); 1013 while (iter.next()) { 1014 const UnicodeString &boundary = iter.getString(); 1015 uint32_t gcMask = U_GET_GC_MASK(boundary.char32At(1)); 1016 if ((gcMask & (U_GC_L_MASK | U_GC_CN_MASK)) == 0) { 1017 // Ignore boundaries for the special reordering groups. 1018 // Take only those for "real scripts" (where the sample character is a Letter, 1019 // and the one for unassigned implicit weights (Cn). 1020 continue; 1021 } 1022 UnicodeString *s = new UnicodeString(boundary); 1023 if (s == NULL) { 1024 status = U_MEMORY_ALLOCATION_ERROR; 1025 return NULL; 1026 } 1027 dest->addElement(s, status); 1028 } 1029 return dest.orphan(); 1030 } 1031 1032 1033 namespace { 1034 1035 /** 1036 * Returns true if one index character string is "better" than the other. 1037 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1038 * better, and otherwise binary-less-than is better. 1039 */ 1040 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1041 const UnicodeString &one, const UnicodeString &other) { 1042 // This is called with primary-equal strings, but never with one.equals(other). 1043 UErrorCode status = U_ZERO_ERROR; 1044 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1045 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1046 if (U_FAILURE(status)) { return FALSE; } 1047 int32_t result = n1.countChar32() - n2.countChar32(); 1048 if (result != 0) { 1049 return result < 0; 1050 } 1051 result = n1.compareCodePointOrder(n2); 1052 if (result != 0) { 1053 return result < 0; 1054 } 1055 return one.compareCodePointOrder(other) < 0; 1056 } 1057 1058 } // namespace 1059 1060 // 1061 // Constructor & Destructor for AlphabeticIndex::Record 1062 // 1063 // Records are internal only, instances are not directly surfaced in the public API. 1064 // This class is mostly struct-like, with all public fields. 1065 1066 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1067 : name_(name), data_(data) {} 1068 1069 AlphabeticIndex::Record::~Record() { 1070 } 1071 1072 1073 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1074 if (U_FAILURE(status)) { 1075 return *this; 1076 } 1077 if (inputList_ == NULL) { 1078 inputList_ = new UVector(status); 1079 if (inputList_ == NULL) { 1080 status = U_MEMORY_ALLOCATION_ERROR; 1081 return *this; 1082 } 1083 inputList_->setDeleter(alphaIndex_deleteRecord); 1084 } 1085 Record *r = new Record(name, data); 1086 if (r == NULL) { 1087 status = U_MEMORY_ALLOCATION_ERROR; 1088 return *this; 1089 } 1090 inputList_->addElement(r, status); 1091 clearBuckets(); 1092 //std::string ss; 1093 //std::string ss2; 1094 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1095 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1096 return *this; 1097 } 1098 1099 1100 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1101 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1102 inputList_->removeAllElements(); 1103 clearBuckets(); 1104 } 1105 return *this; 1106 } 1107 1108 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1109 initBuckets(status); 1110 if (U_FAILURE(status)) { 1111 return 0; 1112 } 1113 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1114 } 1115 1116 1117 int32_t AlphabeticIndex::getBucketIndex() const { 1118 return labelsIterIndex_; 1119 } 1120 1121 1122 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1123 if (U_FAILURE(status)) { 1124 return FALSE; 1125 } 1126 if (buckets_ == NULL && currentBucket_ != NULL) { 1127 status = U_ENUM_OUT_OF_SYNC_ERROR; 1128 return FALSE; 1129 } 1130 initBuckets(status); 1131 if (U_FAILURE(status)) { 1132 return FALSE; 1133 } 1134 ++labelsIterIndex_; 1135 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1136 labelsIterIndex_ = buckets_->getBucketCount(); 1137 return FALSE; 1138 } 1139 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1140 resetRecordIterator(); 1141 return TRUE; 1142 } 1143 1144 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1145 if (currentBucket_ != NULL) { 1146 return currentBucket_->label_; 1147 } else { 1148 return emptyString_; 1149 } 1150 } 1151 1152 1153 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1154 if (currentBucket_ != NULL) { 1155 return currentBucket_->labelType_; 1156 } else { 1157 return U_ALPHAINDEX_NORMAL; 1158 } 1159 } 1160 1161 1162 int32_t AlphabeticIndex::getBucketRecordCount() const { 1163 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1164 return currentBucket_->records_->size(); 1165 } else { 1166 return 0; 1167 } 1168 } 1169 1170 1171 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1172 if (U_FAILURE(status)) { 1173 return *this; 1174 } 1175 internalResetBucketIterator(); 1176 return *this; 1177 } 1178 1179 1180 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1181 if (U_FAILURE(status)) { 1182 return FALSE; 1183 } 1184 if (currentBucket_ == NULL) { 1185 // We are trying to iterate over the items in a bucket, but there is no 1186 // current bucket from the enumeration of buckets. 1187 status = U_INVALID_STATE_ERROR; 1188 return FALSE; 1189 } 1190 if (buckets_ == NULL) { 1191 status = U_ENUM_OUT_OF_SYNC_ERROR; 1192 return FALSE; 1193 } 1194 if (currentBucket_->records_ == NULL) { 1195 return FALSE; 1196 } 1197 ++itemsIterIndex_; 1198 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1199 itemsIterIndex_ = currentBucket_->records_->size(); 1200 return FALSE; 1201 } 1202 return TRUE; 1203 } 1204 1205 1206 const UnicodeString &AlphabeticIndex::getRecordName() const { 1207 const UnicodeString *retStr = &emptyString_; 1208 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1209 itemsIterIndex_ >= 0 && 1210 itemsIterIndex_ < currentBucket_->records_->size()) { 1211 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1212 retStr = &item->name_; 1213 } 1214 return *retStr; 1215 } 1216 1217 const void *AlphabeticIndex::getRecordData() const { 1218 const void *retPtr = NULL; 1219 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1220 itemsIterIndex_ >= 0 && 1221 itemsIterIndex_ < currentBucket_->records_->size()) { 1222 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1223 retPtr = item->data_; 1224 } 1225 return retPtr; 1226 } 1227 1228 1229 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1230 itemsIterIndex_ = -1; 1231 return *this; 1232 } 1233 1234 1235 1236 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1237 const UnicodeString &lowerBoundary, 1238 UAlphabeticIndexLabelType type) 1239 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1240 displayBucket_(NULL), displayIndex_(-1), 1241 records_(NULL) { 1242 } 1243 1244 1245 AlphabeticIndex::Bucket::~Bucket() { 1246 delete records_; 1247 } 1248 1249 U_NAMESPACE_END 1250 1251 #endif // !UCONFIG_NO_COLLATION 1252