1 /* 2 ******************************************************************************* 3 * Copyright (C) 2009-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 11 12 #include "unicode/alphaindex.h" 13 #include "unicode/coleitr.h" 14 #include "unicode/coll.h" 15 #include "unicode/localpointer.h" 16 #include "unicode/normalizer2.h" 17 #include "unicode/tblcoll.h" 18 #include "unicode/ulocdata.h" 19 #include "unicode/uniset.h" 20 #include "unicode/uobject.h" 21 #include "unicode/usetiter.h" 22 #include "unicode/utf16.h" 23 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "uassert.h" 27 #include "uvector.h" 28 29 //#include <string> 30 //#include <iostream> 31 32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 33 34 U_NAMESPACE_BEGIN 35 36 namespace { 37 38 /** 39 * Prefix string for Chinese index buckets. 40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 41 */ 42 const UChar BASE[1] = { 0xFDD0 }; 43 const int32_t BASE_LENGTH = 1; 44 45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 46 const UnicodeString &one, const UnicodeString &other); 47 48 } // namespace 49 50 static int32_t U_CALLCONV 51 collatorComparator(const void *context, const void *left, const void *right); 52 53 static int32_t U_CALLCONV 54 recordCompareFn(const void *context, const void *left, const void *right); 55 56 // UVector<Record *> support function, delete a Record. 57 static void U_CALLCONV 58 alphaIndex_deleteRecord(void *obj) { 59 delete static_cast<AlphabeticIndex::Record *>(obj); 60 } 61 62 namespace { 63 64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 65 UErrorCode &errorCode) { 66 if (U_FAILURE(errorCode)) { return NULL; } 67 if (owned.isValid()) { 68 return owned.orphan(); 69 } 70 UnicodeString *p = new UnicodeString(s); 71 if (p == NULL) { 72 errorCode = U_MEMORY_ALLOCATION_ERROR; 73 } 74 return p; 75 } 76 77 inline UnicodeString *getString(const UVector &list, int32_t i) { 78 return static_cast<UnicodeString *>(list[i]); 79 } 80 81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 82 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 83 } 84 85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 86 return static_cast<AlphabeticIndex::Record *>(list[i]); 87 } 88 89 /** 90 * Like Java Collections.binarySearch(List, String, Comparator). 91 * 92 * @return the index>=0 where the item was found, 93 * or the index<0 for inserting the string at ~index in sorted order 94 */ 95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 96 if (list.size() == 0) { return ~0; } 97 int32_t start = 0; 98 int32_t limit = list.size(); 99 for (;;) { 100 int32_t i = (start + limit) / 2; 101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 102 UErrorCode errorCode = U_ZERO_ERROR; 103 UCollationResult cmp = coll.compare(s, *si, errorCode); 104 if (cmp == UCOL_EQUAL) { 105 return i; 106 } else if (cmp < 0) { 107 if (i == start) { 108 return ~start; // insert s before *si 109 } 110 limit = i; 111 } else { 112 if (i == start) { 113 return ~(start + 1); // insert s after *si 114 } 115 start = i; 116 } 117 } 118 } 119 120 } // namespace 121 122 // The BucketList is not in the anonymous namespace because only Clang 123 // seems to support its use in other classes from there. 124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 125 class BucketList : public UObject { 126 public: 127 BucketList(UVector *bucketList, UVector *publicBucketList) 128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 129 int32_t displayIndex = 0; 130 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 132 } 133 } 134 135 // The virtual destructor must not be inline. 136 // See ticket #8454 for details. 137 virtual ~BucketList(); 138 139 int32_t getBucketCount() const { 140 return immutableVisibleList_->size(); 141 } 142 143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 144 UErrorCode &errorCode) { 145 // binary search 146 int32_t start = 0; 147 int32_t limit = bucketList_->size(); 148 while ((start + 1) < limit) { 149 int32_t i = (start + limit) / 2; 150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 151 UCollationResult nameVsBucket = 152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 153 if (nameVsBucket < 0) { 154 limit = i; 155 } else { 156 start = i; 157 } 158 } 159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 160 if (bucket->displayBucket_ != NULL) { 161 bucket = bucket->displayBucket_; 162 } 163 return bucket->displayIndex_; 164 } 165 166 /** All of the buckets, visible and invisible. */ 167 UVector *bucketList_; 168 /** Just the visible buckets. */ 169 UVector *immutableVisibleList_; 170 }; 171 172 BucketList::~BucketList() { 173 delete bucketList_; 174 if (immutableVisibleList_ != bucketList_) { 175 delete immutableVisibleList_; 176 } 177 } 178 179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 180 delete buckets_; 181 delete collatorPrimaryOnly_; 182 } 183 184 int32_t 185 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 186 return buckets_->getBucketCount(); 187 } 188 189 int32_t 190 AlphabeticIndex::ImmutableIndex::getBucketIndex( 191 const UnicodeString &name, UErrorCode &errorCode) const { 192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 193 } 194 195 const AlphabeticIndex::Bucket * 196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 197 if (0 <= index && index < buckets_->getBucketCount()) { 198 return icu::getBucket(*buckets_->immutableVisibleList_, index); 199 } else { 200 return NULL; 201 } 202 } 203 204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 205 : inputList_(NULL), 206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 207 maxLabelCount_(99), 208 initialLabels_(NULL), firstCharsInScripts_(NULL), 209 collator_(NULL), collatorPrimaryOnly_(NULL), 210 buckets_(NULL) { 211 init(&locale, status); 212 } 213 214 215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 216 : inputList_(NULL), 217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 218 maxLabelCount_(99), 219 initialLabels_(NULL), firstCharsInScripts_(NULL), 220 collator_(collator), collatorPrimaryOnly_(NULL), 221 buckets_(NULL) { 222 init(NULL, status); 223 } 224 225 226 227 AlphabeticIndex::~AlphabeticIndex() { 228 delete collator_; 229 delete collatorPrimaryOnly_; 230 delete firstCharsInScripts_; 231 delete buckets_; 232 delete inputList_; 233 delete initialLabels_; 234 } 235 236 237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 238 if (U_FAILURE(status)) { 239 return *this; 240 } 241 initialLabels_->addAll(additions); 242 clearBuckets(); 243 return *this; 244 } 245 246 247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 248 addIndexExemplars(locale, status); 249 clearBuckets(); 250 return *this; 251 } 252 253 254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 255 if (U_FAILURE(errorCode)) { return NULL; } 256 // In C++, the ImmutableIndex must own its copy of the BucketList, 257 // even if it contains no records, for proper memory management. 258 // We could clone the buckets_ if they are not NULL, 259 // but that would be worth it only if this method is called multiple times, 260 // or called after using the old-style bucket iterator API. 261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 262 LocalPointer<RuleBasedCollator> coll( 263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 264 if (immutableBucketList.isNull() || coll.isNull()) { 265 errorCode = U_MEMORY_ALLOCATION_ERROR; 266 return NULL; 267 } 268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 269 if (immIndex == NULL) { 270 errorCode = U_MEMORY_ALLOCATION_ERROR; 271 return NULL; 272 } 273 // The ImmutableIndex adopted its parameter objects. 274 immutableBucketList.orphan(); 275 coll.orphan(); 276 return immIndex; 277 } 278 279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 280 initBuckets(status); 281 if (U_FAILURE(status)) { 282 return 0; 283 } 284 return buckets_->getBucketCount(); 285 } 286 287 288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 289 if (U_FAILURE(status) || inputList_ == NULL) { 290 return 0; 291 } 292 return inputList_->size(); 293 } 294 295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 297 if (U_FAILURE(errorCode)) { return; } 298 299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 300 const UnicodeString &overflowBoundary = 301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 302 303 // We make a sorted array of elements. 304 // Some of the input may be redundant. 305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 306 // We filter out those cases. 307 UnicodeSetIterator iter(*initialLabels_); 308 while (iter.next()) { 309 const UnicodeString *item = &iter.getString(); 310 LocalPointer<UnicodeString> ownedItem; 311 UBool checkDistinct; 312 int32_t itemLength = item->length(); 313 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 314 checkDistinct = FALSE; 315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 316 item->charAt(itemLength - 2) != 0x2a) { 317 // Use a label if it is marked with one trailing star, 318 // even if the label string sorts the same when all contractions are suppressed. 319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 320 item = ownedItem.getAlias(); 321 if (item == NULL) { 322 errorCode = U_MEMORY_ALLOCATION_ERROR; 323 return; 324 } 325 checkDistinct = FALSE; 326 } else { 327 checkDistinct = TRUE; 328 } 329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 330 // Ignore a primary-ignorable or non-alphabetic index character. 331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 332 // Ignore an index characters that will land in the overflow bucket. 333 } else if (checkDistinct && 334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 335 // Ignore a multi-code point index character that does not sort distinctly 336 // from the sequence of its separate characters. 337 } else { 338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 339 if (insertionPoint < 0) { 340 indexCharacters.insertElementAt( 341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 342 } else { 343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 345 indexCharacters.setElementAt( 346 ownedString(*item, ownedItem, errorCode), insertionPoint); 347 } 348 } 349 } 350 } 351 if (U_FAILURE(errorCode)) { return; } 352 353 // if the result is still too large, cut down to maxCount elements, by removing every nth element 354 355 int32_t size = indexCharacters.size() - 1; 356 if (size > maxLabelCount_) { 357 int32_t count = 0; 358 int32_t old = -1; 359 for (int32_t i = 0; i < indexCharacters.size();) { 360 ++count; 361 int32_t bump = count * maxLabelCount_ / size; 362 if (bump == old) { 363 indexCharacters.removeElementAt(i); 364 } else { 365 old = bump; 366 ++i; 367 } 368 } 369 } 370 } 371 372 namespace { 373 374 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 375 if (!current.startsWith(BASE, BASE_LENGTH)) { 376 return current; 377 } 378 UChar rest = current.charAt(BASE_LENGTH); 379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 380 int32_t count = rest-0x2800; 381 temp.setTo((UChar)(0x30 + count % 10)); 382 if (count >= 10) { 383 count /= 10; 384 temp.insert(0, (UChar)(0x30 + count % 10)); 385 if (count >= 10) { 386 count /= 10; 387 temp.insert(0, (UChar)(0x30 + count)); 388 } 389 } 390 return temp.append((UChar)0x5283); 391 } 392 return temp.setTo(current, BASE_LENGTH); 393 } 394 395 UBool hasMultiplePrimaryWeights( 396 CollationElementIterator &cei, int32_t variableTop, 397 const UnicodeString &s, UErrorCode &errorCode) { 398 cei.setText(s, errorCode); 399 UBool seenPrimary = FALSE; 400 for (;;) { 401 int32_t ce32 = cei.next(errorCode); 402 if (ce32 == CollationElementIterator::NULLORDER) { 403 break; 404 } 405 int32_t p = CollationElementIterator::primaryOrder(ce32); 406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) { 407 // not primary ignorable, and not a continuation CE 408 if (seenPrimary) { 409 return TRUE; 410 } 411 seenPrimary = TRUE; 412 } 413 } 414 return FALSE; 415 } 416 417 } // namespace 418 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 420 // Initialize indexCharacters. 421 UVector indexCharacters(errorCode); 422 indexCharacters.setDeleter(uprv_deleteUObject); 423 initLabels(indexCharacters, errorCode); 424 if (U_FAILURE(errorCode)) { return NULL; } 425 426 // Variables for hasMultiplePrimaryWeights(). 427 LocalPointer<CollationElementIterator> cei( 428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); 429 if (cei.isNull()) { 430 errorCode = U_MEMORY_ALLOCATION_ERROR; 431 return NULL; 432 } 433 int32_t variableTop; 434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 435 variableTop = CollationElementIterator::primaryOrder( 436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); 437 } else { 438 variableTop = 0; 439 } 440 UBool hasInvisibleBuckets = FALSE; 441 442 // Helper arrays for Chinese Pinyin collation. 443 Bucket *asciiBuckets[26] = { 444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 446 }; 447 Bucket *pinyinBuckets[26] = { 448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 450 }; 451 UBool hasPinyin = FALSE; 452 453 LocalPointer<UVector> bucketList(new UVector(errorCode)); 454 if (bucketList.isNull()) { 455 errorCode = U_MEMORY_ALLOCATION_ERROR; 456 return NULL; 457 } 458 bucketList->setDeleter(uprv_deleteUObject); 459 460 // underflow bucket 461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 462 if (bucket == NULL) { 463 errorCode = U_MEMORY_ALLOCATION_ERROR; 464 return NULL; 465 } 466 bucketList->addElement(bucket, errorCode); 467 if (U_FAILURE(errorCode)) { return NULL; } 468 469 UnicodeString temp; 470 471 // fix up the list, adding underflow, additions, overflow 472 // Insert inflow labels as needed. 473 int32_t scriptIndex = -1; 474 const UnicodeString *scriptUpperBoundary = &emptyString_; 475 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 476 UnicodeString ¤t = *getString(indexCharacters, i); 477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 478 // We crossed the script boundary into a new script. 479 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 480 UBool skippedScript = FALSE; 481 for (;;) { 482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 484 break; 485 } 486 skippedScript = TRUE; 487 } 488 if (skippedScript && bucketList->size() > 1) { 489 // We are skipping one or more scripts, 490 // and we are not just getting out of the underflow label. 491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 492 if (bucket == NULL) { 493 errorCode = U_MEMORY_ALLOCATION_ERROR; 494 return NULL; 495 } 496 bucketList->addElement(bucket, errorCode); 497 } 498 } 499 // Add a bucket with the current label. 500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 501 if (bucket == NULL) { 502 errorCode = U_MEMORY_ALLOCATION_ERROR; 503 return NULL; 504 } 505 bucketList->addElement(bucket, errorCode); 506 // Remember ASCII and Pinyin buckets for Pinyin redirects. 507 UChar c; 508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 509 asciiBuckets[c - 0x41] = bucket; 510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 512 pinyinBuckets[c - 0x41] = bucket; 513 hasPinyin = TRUE; 514 } 515 // Check for multiple primary weights. 516 if (!current.startsWith(BASE, BASE_LENGTH) && 517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && 518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 519 // "AE-ligature" or "Sch" etc. 520 for (int32_t i = bucketList->size() - 2;; --i) { 521 Bucket *singleBucket = getBucket(*bucketList, i); 522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 523 // There is no single-character bucket since the last 524 // underflow or inflow label. 525 break; 526 } 527 if (singleBucket->displayBucket_ == NULL && 528 !hasMultiplePrimaryWeights( 529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { 530 // Add an invisible bucket that redirects strings greater than the expansion 531 // to the previous single-character bucket. 532 // For example, after ... Q R S Sch we add Sch\uFFFF->S 533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 534 bucket = new Bucket(emptyString_, 535 UnicodeString(current).append((UChar)0xFFFF), 536 U_ALPHAINDEX_NORMAL); 537 if (bucket == NULL) { 538 errorCode = U_MEMORY_ALLOCATION_ERROR; 539 return NULL; 540 } 541 bucket->displayBucket_ = singleBucket; 542 bucketList->addElement(bucket, errorCode); 543 hasInvisibleBuckets = TRUE; 544 break; 545 } 546 } 547 } 548 } 549 if (U_FAILURE(errorCode)) { return NULL; } 550 if (bucketList->size() == 1) { 551 // No real labels, show only the underflow label. 552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 553 if (bl == NULL) { 554 errorCode = U_MEMORY_ALLOCATION_ERROR; 555 return NULL; 556 } 557 bucketList.orphan(); 558 return bl; 559 } 560 // overflow bucket 561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 562 if (bucket == NULL) { 563 errorCode = U_MEMORY_ALLOCATION_ERROR; 564 return NULL; 565 } 566 bucketList->addElement(bucket, errorCode); // final 567 568 if (hasPinyin) { 569 // Redirect Pinyin buckets. 570 Bucket *asciiBucket = NULL; 571 for (int32_t i = 0; i < 26; ++i) { 572 if (asciiBuckets[i] != NULL) { 573 asciiBucket = asciiBuckets[i]; 574 } 575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 576 pinyinBuckets[i]->displayBucket_ = asciiBucket; 577 hasInvisibleBuckets = TRUE; 578 } 579 } 580 } 581 582 if (U_FAILURE(errorCode)) { return NULL; } 583 if (!hasInvisibleBuckets) { 584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 585 if (bl == NULL) { 586 errorCode = U_MEMORY_ALLOCATION_ERROR; 587 return NULL; 588 } 589 bucketList.orphan(); 590 return bl; 591 } 592 // Merge inflow buckets that are visually adjacent. 593 // Iterate backwards: Merge inflow into overflow rather than the other way around. 594 int32_t i = bucketList->size() - 1; 595 Bucket *nextBucket = getBucket(*bucketList, i); 596 while (--i > 0) { 597 bucket = getBucket(*bucketList, i); 598 if (bucket->displayBucket_ != NULL) { 599 continue; // skip invisible buckets 600 } 601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 603 bucket->displayBucket_ = nextBucket; 604 continue; 605 } 606 } 607 nextBucket = bucket; 608 } 609 610 LocalPointer<UVector> publicBucketList(new UVector(errorCode)); 611 if (bucketList.isNull()) { 612 errorCode = U_MEMORY_ALLOCATION_ERROR; 613 return NULL; 614 } 615 // Do not call publicBucketList->setDeleter(): 616 // This vector shares its objects with the bucketList. 617 for (int32_t i = 0; i < bucketList->size(); ++i) { 618 bucket = getBucket(*bucketList, i); 619 if (bucket->displayBucket_ == NULL) { 620 publicBucketList->addElement(bucket, errorCode); 621 } 622 } 623 if (U_FAILURE(errorCode)) { return NULL; } 624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 625 if (bl == NULL) { 626 errorCode = U_MEMORY_ALLOCATION_ERROR; 627 return NULL; 628 } 629 bucketList.orphan(); 630 publicBucketList.orphan(); 631 return bl; 632 } 633 634 /** 635 * Creates an index, and buckets and sorts the list of records into the index. 636 */ 637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 638 if (U_FAILURE(errorCode) || buckets_ != NULL) { 639 return; 640 } 641 buckets_ = createBucketList(errorCode); 642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 643 return; 644 } 645 646 // Sort the records by name. 647 // Stable sort preserves input order of collation duplicates. 648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 649 650 // Now, we traverse all of the input, which is now sorted. 651 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 652 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 654 // we need to improve it for that case. 655 656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 657 int32_t bucketIndex = 1; 658 Bucket *nextBucket; 659 const UnicodeString *upperBoundary; 660 if (bucketIndex < buckets_->bucketList_->size()) { 661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 662 upperBoundary = &nextBucket->lowerBoundary_; 663 } else { 664 nextBucket = NULL; 665 upperBoundary = NULL; 666 } 667 for (int32_t i = 0; i < inputList_->size(); ++i) { 668 Record *r = getRecord(*inputList_, i); 669 // if the current bucket isn't the right one, find the one that is 670 // We have a special flag for the last bucket so that we don't look any further 671 while (upperBoundary != NULL && 672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 673 currentBucket = nextBucket; 674 // now reset the boundary that we compare against 675 if (bucketIndex < buckets_->bucketList_->size()) { 676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 677 upperBoundary = &nextBucket->lowerBoundary_; 678 } else { 679 upperBoundary = NULL; 680 } 681 } 682 // now put the record into the bucket. 683 Bucket *bucket = currentBucket; 684 if (bucket->displayBucket_ != NULL) { 685 bucket = bucket->displayBucket_; 686 } 687 if (bucket->records_ == NULL) { 688 bucket->records_ = new UVector(errorCode); 689 if (bucket->records_ == NULL) { 690 errorCode = U_MEMORY_ALLOCATION_ERROR; 691 return; 692 } 693 } 694 bucket->records_->addElement(r, errorCode); 695 } 696 } 697 698 void AlphabeticIndex::clearBuckets() { 699 if (buckets_ != NULL) { 700 delete buckets_; 701 buckets_ = NULL; 702 internalResetBucketIterator(); 703 } 704 } 705 706 void AlphabeticIndex::internalResetBucketIterator() { 707 labelsIterIndex_ = -1; 708 currentBucket_ = NULL; 709 } 710 711 712 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 713 if (U_FAILURE(status)) { return; } 714 // Chinese index characters, which are specific to each of the several Chinese tailorings, 715 // take precedence over the single locale data exemplar set per language. 716 const char *language = locale.getLanguage(); 717 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || 718 uprv_strcmp(language, "ko") == 0) { 719 // TODO: This should be done regardless of the language, but it's expensive. 720 // We should add a Collator function (can be @internal) 721 // to enumerate just the contractions that start with a given code point or string. 722 if (addChineseIndexCharacters(status) || U_FAILURE(status)) { 723 return; 724 } 725 } 726 727 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 728 if (U_FAILURE(status)) { 729 return; 730 } 731 732 UnicodeSet exemplars; 733 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 734 if (U_SUCCESS(status)) { 735 initialLabels_->addAll(exemplars); 736 return; 737 } 738 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 739 740 // The locale data did not include explicit Index characters. 741 // Synthesize a set of them from the locale's standard exemplar characters. 742 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 743 if (U_FAILURE(status)) { 744 return; 745 } 746 747 // question: should we add auxiliary exemplars? 748 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 749 exemplars.add(0x61, 0x7A); 750 } 751 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 752 // cut down to small list 753 exemplars.remove(0xAC00, 0xD7A3). 754 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 755 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 756 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 757 add(0xD30C).add(0xD558); 758 } 759 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 760 // cut down to small list 761 // make use of the fact that Ethiopic is allocated in 8's, where 762 // the base is 0 mod 8. 763 UnicodeSet ethiopic( 764 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 765 UnicodeSetIterator it(ethiopic); 766 while (it.next() && !it.isString()) { 767 if ((it.getCodepoint() & 0x7) != 0) { 768 exemplars.remove(it.getCodepoint()); 769 } 770 } 771 } 772 773 // Upper-case any that aren't already so. 774 // (We only do this for synthesized index characters.) 775 UnicodeSetIterator it(exemplars); 776 UnicodeString upperC; 777 while (it.next()) { 778 const UnicodeString &exemplarC = it.getString(); 779 upperC = exemplarC; 780 upperC.toUpper(locale); 781 initialLabels_->add(upperC); 782 } 783 } 784 785 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 786 UnicodeSet contractions; 787 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), 788 contractions.toUSet(), NULL, FALSE, &errorCode); 789 if (U_FAILURE(errorCode)) { return FALSE; } 790 UnicodeString firstHanBoundary; 791 UBool hasPinyin = FALSE; 792 UnicodeSetIterator iter(contractions); 793 while (iter.next()) { 794 const UnicodeString &s = iter.getString(); 795 if (s.startsWith(BASE, BASE_LENGTH)) { 796 initialLabels_->add(s); 797 if (firstHanBoundary.isEmpty() || 798 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { 799 firstHanBoundary = s; 800 } 801 UChar c = s.charAt(s.length() - 1); 802 if (0x41 <= c && c <= 0x5A) { // A-Z 803 hasPinyin = TRUE; 804 } 805 } 806 } 807 if (hasPinyin) { 808 initialLabels_->add(0x41, 0x5A); // A-Z 809 } 810 if (!firstHanBoundary.isEmpty()) { 811 // The hardcoded list of script boundaries includes U+4E00 812 // which is tailored to not be the first primary 813 // in all Chinese tailorings except "unihan". 814 // Replace U+4E00 with the first boundary string from the tailoring. 815 // TODO: This becomes obsolete when the root collator gets 816 // reliable script-first-primary mappings. 817 int32_t hanIndex = binarySearch( 818 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); 819 if (hanIndex >= 0) { 820 UnicodeString *fh = new UnicodeString(firstHanBoundary); 821 if (fh == NULL) { 822 errorCode = U_MEMORY_ALLOCATION_ERROR; 823 return FALSE; 824 } 825 firstCharsInScripts_->setElementAt(fh, hanIndex); 826 } 827 return TRUE; 828 } else { 829 return FALSE; 830 } 831 } 832 833 834 /* 835 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 836 */ 837 static const UChar CGJ = 0x034F; 838 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 839 UnicodeString result; 840 if (item.length() == 0) { 841 return result; 842 } 843 int32_t i = 0; 844 for (;;) { 845 UChar32 cp = item.char32At(i); 846 result.append(cp); 847 i = item.moveIndex32(i, 1); 848 if (i >= item.length()) { 849 break; 850 } 851 result.append(CGJ); 852 } 853 return result; 854 } 855 856 857 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 858 return FALSE; 859 } 860 861 862 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 863 return FALSE; 864 } 865 866 867 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 868 // There are no known non-RuleBasedCollator collators, and none ever expected. 869 // But, in case that changes, better a null pointer than a wrong type. 870 return *dynamic_cast<RuleBasedCollator *>(collator_); 871 } 872 873 874 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 875 return inflowLabel_; 876 } 877 878 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 879 return overflowLabel_; 880 } 881 882 883 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 884 return underflowLabel_; 885 } 886 887 888 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 889 inflowLabel_ = label; 890 clearBuckets(); 891 return *this; 892 } 893 894 895 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 896 overflowLabel_ = label; 897 clearBuckets(); 898 return *this; 899 } 900 901 902 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 903 underflowLabel_ = label; 904 clearBuckets(); 905 return *this; 906 } 907 908 909 int32_t AlphabeticIndex::getMaxLabelCount() const { 910 return maxLabelCount_; 911 } 912 913 914 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 915 if (U_FAILURE(status)) { 916 return *this; 917 } 918 if (maxLabelCount <= 0) { 919 status = U_ILLEGAL_ARGUMENT_ERROR; 920 return *this; 921 } 922 maxLabelCount_ = maxLabelCount; 923 clearBuckets(); 924 return *this; 925 } 926 927 928 // 929 // init() - Common code for constructors. 930 // 931 932 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 933 if (U_FAILURE(status)) { return; } 934 if (locale == NULL && collator_ == NULL) { 935 status = U_ILLEGAL_ARGUMENT_ERROR; 936 return; 937 } 938 939 initialLabels_ = new UnicodeSet(); 940 if (initialLabels_ == NULL) { 941 status = U_MEMORY_ALLOCATION_ERROR; 942 return; 943 } 944 945 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 946 overflowLabel_ = inflowLabel_; 947 underflowLabel_ = inflowLabel_; 948 949 if (collator_ == NULL) { 950 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); 951 if (U_FAILURE(status)) { return; } 952 if (collator_ == NULL) { 953 status = U_MEMORY_ALLOCATION_ERROR; 954 return; 955 } 956 } 957 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 958 if (collatorPrimaryOnly_ == NULL) { 959 status = U_MEMORY_ALLOCATION_ERROR; 960 return; 961 } 962 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 963 firstCharsInScripts_ = firstStringsInScript(status); 964 if (U_FAILURE(status)) { return; } 965 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 966 UnicodeString _4E00((UChar)0x4E00); 967 UnicodeString _1100((UChar)0x1100); 968 UnicodeString _1112((UChar)0x1112); 969 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && 970 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { 971 // The standard Korean tailoring sorts Hanja (Han characters) 972 // as secondary differences from Hangul syllables. 973 // This makes U+4E00 not useful as a Han-script boundary. 974 // TODO: This becomes obsolete when the root collator gets 975 // reliable script-first-primary mappings. 976 int32_t hanIndex = binarySearch( 977 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); 978 if (hanIndex >= 0) { 979 firstCharsInScripts_->removeElementAt(hanIndex); 980 } 981 } 982 // Guard against a degenerate collator where 983 // some script boundary strings are primary ignorable. 984 for (;;) { 985 if (U_FAILURE(status)) { return; } 986 if (firstCharsInScripts_->isEmpty()) { 987 // AlphabeticIndex requires some non-ignorable script boundary strings. 988 status = U_ILLEGAL_ARGUMENT_ERROR; 989 return; 990 } 991 if (collatorPrimaryOnly_->compare( 992 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 993 emptyString_, status) == UCOL_EQUAL) { 994 firstCharsInScripts_->removeElementAt(0); 995 } else { 996 break; 997 } 998 } 999 1000 if (locale != NULL) { 1001 addIndexExemplars(*locale, status); 1002 } 1003 } 1004 1005 1006 // 1007 // Comparison function for UVector<UnicodeString *> sorting with a collator. 1008 // 1009 static int32_t U_CALLCONV 1010 collatorComparator(const void *context, const void *left, const void *right) { 1011 const UElement *leftElement = static_cast<const UElement *>(left); 1012 const UElement *rightElement = static_cast<const UElement *>(right); 1013 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 1014 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 1015 1016 if (leftString == rightString) { 1017 // Catches case where both are NULL 1018 return 0; 1019 } 1020 if (leftString == NULL) { 1021 return 1; 1022 }; 1023 if (rightString == NULL) { 1024 return -1; 1025 } 1026 const Collator *col = static_cast<const Collator *>(context); 1027 UErrorCode errorCode = U_ZERO_ERROR; 1028 return col->compare(*leftString, *rightString, errorCode); 1029 } 1030 1031 // 1032 // Comparison function for UVector<Record *> sorting with a collator. 1033 // 1034 static int32_t U_CALLCONV 1035 recordCompareFn(const void *context, const void *left, const void *right) { 1036 const UElement *leftElement = static_cast<const UElement *>(left); 1037 const UElement *rightElement = static_cast<const UElement *>(right); 1038 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 1039 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 1040 const Collator *col = static_cast<const Collator *>(context); 1041 UErrorCode errorCode = U_ZERO_ERROR; 1042 return col->compare(leftRec->name_, rightRec->name_, errorCode); 1043 } 1044 1045 1046 /** 1047 * This list contains one character per script that has the 1048 * lowest primary weight for that script in the root collator. 1049 * This list will be copied and sorted to account for script reordering. 1050 * 1051 * <p>TODO: This is fragile. If the first character of a script is tailored 1052 * so that it does not map to the script's lowest primary weight any more, 1053 * then the buckets will be off. 1054 * There are hacks in the code to handle the known CJK tailorings of U+4E00. 1055 * 1056 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. 1057 * 1058 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in 1059 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java 1060 */ 1061 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 1062 0x41, 0, 0x03B1, 0, 1063 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, 1064 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, 1065 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 1066 0xAAF2, 0, // Meetei Mayek 1067 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 1068 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada 1069 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri 1070 0x1B83, 0, 1071 0xD802, 0xDE00, 0, 0x0E01, 0, 1072 0x0EDE, 0, // Lao 1073 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, 1074 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 1075 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma 1076 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, 1077 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, 1078 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 1079 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao 1080 0xD800, 0xDE80, 0, 1081 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, 1082 0xD801, 0xDC80, 0, 1083 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng 1084 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, 1085 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 1086 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive 1087 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs 1088 0x4E00, 0, 1089 // TODO: The overflow bucket's lowerBoundary string should be the 1090 // first item after the last reordering group in the collator's script order. 1091 // This should normally be the first Unicode code point 1092 // that is unassigned (U+0378 in Unicode 6.3) and untailored. 1093 // However, at least up to ICU 51 the Hani reordering group includes 1094 // unassigned code points, 1095 // and there is no stable string for the start of the trailing-weights range. 1096 // The only known string that sorts "high" is U+FFFF. 1097 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, 1098 // and fix relevant test code. 1099 // Ideally, FractionalUCA.txt will have a "script first primary" 1100 // for unassigned code points. 1101 0xFFFF, 0 1102 }; 1103 1104 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 1105 if (U_FAILURE(status)) { 1106 return NULL; 1107 } 1108 UVector *dest = new UVector(status); 1109 if (dest == NULL) { 1110 status = U_MEMORY_ALLOCATION_ERROR; 1111 return NULL; 1112 } 1113 dest->setDeleter(uprv_deleteUObject); 1114 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; 1115 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); 1116 do { 1117 if (U_FAILURE(status)) { 1118 return dest; 1119 } 1120 UnicodeString *str = new UnicodeString(src, -1); 1121 if (str == NULL) { 1122 status = U_MEMORY_ALLOCATION_ERROR; 1123 return dest; 1124 } 1125 dest->addElement(str, status); 1126 src += str->length() + 1; 1127 } while (src < limit); 1128 return dest; 1129 } 1130 1131 1132 namespace { 1133 1134 /** 1135 * Returns true if one index character string is "better" than the other. 1136 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1137 * better, and otherwise binary-less-than is better. 1138 */ 1139 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1140 const UnicodeString &one, const UnicodeString &other) { 1141 // This is called with primary-equal strings, but never with one.equals(other). 1142 UErrorCode status = U_ZERO_ERROR; 1143 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1144 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1145 if (U_FAILURE(status)) { return FALSE; } 1146 int32_t result = n1.countChar32() - n2.countChar32(); 1147 if (result != 0) { 1148 return result < 0; 1149 } 1150 result = n1.compareCodePointOrder(n2); 1151 if (result != 0) { 1152 return result < 0; 1153 } 1154 return one.compareCodePointOrder(other) < 0; 1155 } 1156 1157 } // namespace 1158 1159 // 1160 // Constructor & Destructor for AlphabeticIndex::Record 1161 // 1162 // Records are internal only, instances are not directly surfaced in the public API. 1163 // This class is mostly struct-like, with all public fields. 1164 1165 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1166 : name_(name), data_(data) {} 1167 1168 AlphabeticIndex::Record::~Record() { 1169 } 1170 1171 1172 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1173 if (U_FAILURE(status)) { 1174 return *this; 1175 } 1176 if (inputList_ == NULL) { 1177 inputList_ = new UVector(status); 1178 if (inputList_ == NULL) { 1179 status = U_MEMORY_ALLOCATION_ERROR; 1180 return *this; 1181 } 1182 inputList_->setDeleter(alphaIndex_deleteRecord); 1183 } 1184 Record *r = new Record(name, data); 1185 if (r == NULL) { 1186 status = U_MEMORY_ALLOCATION_ERROR; 1187 return *this; 1188 } 1189 inputList_->addElement(r, status); 1190 clearBuckets(); 1191 //std::string ss; 1192 //std::string ss2; 1193 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1194 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1195 return *this; 1196 } 1197 1198 1199 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1200 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1201 inputList_->removeAllElements(); 1202 clearBuckets(); 1203 } 1204 return *this; 1205 } 1206 1207 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1208 initBuckets(status); 1209 if (U_FAILURE(status)) { 1210 return 0; 1211 } 1212 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1213 } 1214 1215 1216 int32_t AlphabeticIndex::getBucketIndex() const { 1217 return labelsIterIndex_; 1218 } 1219 1220 1221 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1222 if (U_FAILURE(status)) { 1223 return FALSE; 1224 } 1225 if (buckets_ == NULL && currentBucket_ != NULL) { 1226 status = U_ENUM_OUT_OF_SYNC_ERROR; 1227 return FALSE; 1228 } 1229 initBuckets(status); 1230 if (U_FAILURE(status)) { 1231 return FALSE; 1232 } 1233 ++labelsIterIndex_; 1234 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1235 labelsIterIndex_ = buckets_->getBucketCount(); 1236 return FALSE; 1237 } 1238 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1239 resetRecordIterator(); 1240 return TRUE; 1241 } 1242 1243 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1244 if (currentBucket_ != NULL) { 1245 return currentBucket_->label_; 1246 } else { 1247 return emptyString_; 1248 } 1249 } 1250 1251 1252 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1253 if (currentBucket_ != NULL) { 1254 return currentBucket_->labelType_; 1255 } else { 1256 return U_ALPHAINDEX_NORMAL; 1257 } 1258 } 1259 1260 1261 int32_t AlphabeticIndex::getBucketRecordCount() const { 1262 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1263 return currentBucket_->records_->size(); 1264 } else { 1265 return 0; 1266 } 1267 } 1268 1269 1270 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1271 if (U_FAILURE(status)) { 1272 return *this; 1273 } 1274 internalResetBucketIterator(); 1275 return *this; 1276 } 1277 1278 1279 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1280 if (U_FAILURE(status)) { 1281 return FALSE; 1282 } 1283 if (currentBucket_ == NULL) { 1284 // We are trying to iterate over the items in a bucket, but there is no 1285 // current bucket from the enumeration of buckets. 1286 status = U_INVALID_STATE_ERROR; 1287 return FALSE; 1288 } 1289 if (buckets_ == NULL) { 1290 status = U_ENUM_OUT_OF_SYNC_ERROR; 1291 return FALSE; 1292 } 1293 if (currentBucket_->records_ == NULL) { 1294 return FALSE; 1295 } 1296 ++itemsIterIndex_; 1297 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1298 itemsIterIndex_ = currentBucket_->records_->size(); 1299 return FALSE; 1300 } 1301 return TRUE; 1302 } 1303 1304 1305 const UnicodeString &AlphabeticIndex::getRecordName() const { 1306 const UnicodeString *retStr = &emptyString_; 1307 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1308 itemsIterIndex_ >= 0 && 1309 itemsIterIndex_ < currentBucket_->records_->size()) { 1310 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1311 retStr = &item->name_; 1312 } 1313 return *retStr; 1314 } 1315 1316 const void *AlphabeticIndex::getRecordData() const { 1317 const void *retPtr = NULL; 1318 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1319 itemsIterIndex_ >= 0 && 1320 itemsIterIndex_ < currentBucket_->records_->size()) { 1321 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1322 retPtr = item->data_; 1323 } 1324 return retPtr; 1325 } 1326 1327 1328 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1329 itemsIterIndex_ = -1; 1330 return *this; 1331 } 1332 1333 1334 1335 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1336 const UnicodeString &lowerBoundary, 1337 UAlphabeticIndexLabelType type) 1338 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1339 displayBucket_(NULL), displayIndex_(-1), 1340 records_(NULL) { 1341 } 1342 1343 1344 AlphabeticIndex::Bucket::~Bucket() { 1345 delete records_; 1346 } 1347 1348 U_NAMESPACE_END 1349 1350 #endif 1351