1 /* 2 ******************************************************************************* 3 * Copyright (C) 2009-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 11 12 #include "unicode/alphaindex.h" 13 #include "unicode/coleitr.h" 14 #include "unicode/coll.h" 15 #include "unicode/localpointer.h" 16 #include "unicode/normalizer2.h" 17 #include "unicode/tblcoll.h" 18 #include "unicode/ulocdata.h" 19 #include "unicode/uniset.h" 20 #include "unicode/uobject.h" 21 #include "unicode/usetiter.h" 22 #include "unicode/utf16.h" 23 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "uassert.h" 27 #include "uvector.h" 28 29 //#include <string> 30 //#include <iostream> 31 32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 33 34 U_NAMESPACE_BEGIN 35 36 namespace { 37 38 /** 39 * Prefix string for Chinese index buckets. 40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 41 */ 42 const UChar BASE[1] = { 0xFDD0 }; 43 const int32_t BASE_LENGTH = 1; 44 45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 46 const UnicodeString &one, const UnicodeString &other); 47 48 } // namespace 49 50 static int32_t U_CALLCONV 51 collatorComparator(const void *context, const void *left, const void *right); 52 53 static int32_t U_CALLCONV 54 recordCompareFn(const void *context, const void *left, const void *right); 55 56 // UVector<Record *> support function, delete a Record. 57 static void U_CALLCONV 58 alphaIndex_deleteRecord(void *obj) { 59 delete static_cast<AlphabeticIndex::Record *>(obj); 60 } 61 62 namespace { 63 64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 65 UErrorCode &errorCode) { 66 if (U_FAILURE(errorCode)) { return NULL; } 67 if (owned.isValid()) { 68 return owned.orphan(); 69 } 70 UnicodeString *p = new UnicodeString(s); 71 if (p == NULL) { 72 errorCode = U_MEMORY_ALLOCATION_ERROR; 73 } 74 return p; 75 } 76 77 inline UnicodeString *getString(const UVector &list, int32_t i) { 78 return static_cast<UnicodeString *>(list[i]); 79 } 80 81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 82 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 83 } 84 85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 86 return static_cast<AlphabeticIndex::Record *>(list[i]); 87 } 88 89 /** 90 * Like Java Collections.binarySearch(List, String, Comparator). 91 * 92 * @return the index>=0 where the item was found, 93 * or the index<0 for inserting the string at ~index in sorted order 94 */ 95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 96 if (list.size() == 0) { return ~0; } 97 int32_t start = 0; 98 int32_t limit = list.size(); 99 for (;;) { 100 int32_t i = (start + limit) / 2; 101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 102 UErrorCode errorCode = U_ZERO_ERROR; 103 UCollationResult cmp = coll.compare(s, *si, errorCode); 104 if (cmp == UCOL_EQUAL) { 105 return i; 106 } else if (cmp < 0) { 107 if (i == start) { 108 return ~start; // insert s before *si 109 } 110 limit = i; 111 } else { 112 if (i == start) { 113 return ~(start + 1); // insert s after *si 114 } 115 start = i; 116 } 117 } 118 } 119 120 } // namespace 121 122 // The BucketList is not in the anonymous namespace because only Clang 123 // seems to support its use in other classes from there. 124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 125 class BucketList : public UObject { 126 public: 127 BucketList(UVector *bucketList, UVector *publicBucketList) 128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 129 int32_t displayIndex = 0; 130 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 132 } 133 } 134 135 // The virtual destructor must not be inline. 136 // See ticket #8454 for details. 137 virtual ~BucketList(); 138 139 int32_t getBucketCount() const { 140 return immutableVisibleList_->size(); 141 } 142 143 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 144 UErrorCode &errorCode) { 145 // binary search 146 int32_t start = 0; 147 int32_t limit = bucketList_->size(); 148 while ((start + 1) < limit) { 149 int32_t i = (start + limit) / 2; 150 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 151 UCollationResult nameVsBucket = 152 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 153 if (nameVsBucket < 0) { 154 limit = i; 155 } else { 156 start = i; 157 } 158 } 159 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 160 if (bucket->displayBucket_ != NULL) { 161 bucket = bucket->displayBucket_; 162 } 163 return bucket->displayIndex_; 164 } 165 166 /** All of the buckets, visible and invisible. */ 167 UVector *bucketList_; 168 /** Just the visible buckets. */ 169 UVector *immutableVisibleList_; 170 }; 171 172 BucketList::~BucketList() { 173 delete bucketList_; 174 if (immutableVisibleList_ != bucketList_) { 175 delete immutableVisibleList_; 176 } 177 } 178 179 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 180 delete buckets_; 181 delete collatorPrimaryOnly_; 182 } 183 184 int32_t 185 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 186 return buckets_->getBucketCount(); 187 } 188 189 int32_t 190 AlphabeticIndex::ImmutableIndex::getBucketIndex( 191 const UnicodeString &name, UErrorCode &errorCode) const { 192 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 193 } 194 195 const AlphabeticIndex::Bucket * 196 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 197 if (0 <= index && index < buckets_->getBucketCount()) { 198 return icu::getBucket(*buckets_->immutableVisibleList_, index); 199 } else { 200 return NULL; 201 } 202 } 203 204 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 205 : inputList_(NULL), 206 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 207 maxLabelCount_(99), 208 initialLabels_(NULL), firstCharsInScripts_(NULL), 209 collator_(NULL), collatorPrimaryOnly_(NULL), 210 buckets_(NULL) { 211 init(&locale, status); 212 } 213 214 215 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 216 : inputList_(NULL), 217 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 218 maxLabelCount_(99), 219 initialLabels_(NULL), firstCharsInScripts_(NULL), 220 collator_(collator), collatorPrimaryOnly_(NULL), 221 buckets_(NULL) { 222 init(NULL, status); 223 } 224 225 226 227 AlphabeticIndex::~AlphabeticIndex() { 228 delete collator_; 229 delete collatorPrimaryOnly_; 230 delete firstCharsInScripts_; 231 delete buckets_; 232 delete inputList_; 233 delete initialLabels_; 234 } 235 236 237 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 238 if (U_FAILURE(status)) { 239 return *this; 240 } 241 initialLabels_->addAll(additions); 242 clearBuckets(); 243 return *this; 244 } 245 246 247 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 248 addIndexExemplars(&locale, status); 249 clearBuckets(); 250 return *this; 251 } 252 253 254 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 255 if (U_FAILURE(errorCode)) { return NULL; } 256 // In C++, the ImmutableIndex must own its copy of the BucketList, 257 // even if it contains no records, for proper memory management. 258 // We could clone the buckets_ if they are not NULL, 259 // but that would be worth it only if this method is called multiple times, 260 // or called after using the old-style bucket iterator API. 261 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 262 LocalPointer<RuleBasedCollator> coll( 263 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 264 if (immutableBucketList.isNull() || coll.isNull()) { 265 errorCode = U_MEMORY_ALLOCATION_ERROR; 266 return NULL; 267 } 268 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 269 if (immIndex == NULL) { 270 errorCode = U_MEMORY_ALLOCATION_ERROR; 271 return NULL; 272 } 273 // The ImmutableIndex adopted its parameter objects. 274 immutableBucketList.orphan(); 275 coll.orphan(); 276 return immIndex; 277 } 278 279 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 280 initBuckets(status); 281 if (U_FAILURE(status)) { 282 return 0; 283 } 284 return buckets_->getBucketCount(); 285 } 286 287 288 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 289 if (U_FAILURE(status) || inputList_ == NULL) { 290 return 0; 291 } 292 return inputList_->size(); 293 } 294 295 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 296 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 297 if (U_FAILURE(errorCode)) { return; } 298 299 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 300 const UnicodeString &overflowBoundary = 301 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 302 303 // We make a sorted array of elements. 304 // Some of the input may be redundant. 305 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 306 // We filter out those cases. 307 UnicodeSetIterator iter(*initialLabels_); 308 while (iter.next()) { 309 const UnicodeString *item = &iter.getString(); 310 LocalPointer<UnicodeString> ownedItem; 311 UBool checkDistinct; 312 int32_t itemLength = item->length(); 313 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 314 checkDistinct = FALSE; 315 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 316 item->charAt(itemLength - 2) != 0x2a) { 317 // Use a label if it is marked with one trailing star, 318 // even if the label string sorts the same when all contractions are suppressed. 319 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 320 item = ownedItem.getAlias(); 321 if (item == NULL) { 322 errorCode = U_MEMORY_ALLOCATION_ERROR; 323 return; 324 } 325 checkDistinct = FALSE; 326 } else { 327 checkDistinct = TRUE; 328 } 329 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 330 // Ignore a primary-ignorable or non-alphabetic index character. 331 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 332 // Ignore an index characters that will land in the overflow bucket. 333 } else if (checkDistinct && 334 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 335 // Ignore a multi-code point index character that does not sort distinctly 336 // from the sequence of its separate characters. 337 } else { 338 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 339 if (insertionPoint < 0) { 340 indexCharacters.insertElementAt( 341 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 342 } else { 343 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 344 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 345 indexCharacters.setElementAt( 346 ownedString(*item, ownedItem, errorCode), insertionPoint); 347 } 348 } 349 } 350 } 351 if (U_FAILURE(errorCode)) { return; } 352 353 // if the result is still too large, cut down to maxCount elements, by removing every nth element 354 355 int32_t size = indexCharacters.size() - 1; 356 if (size > maxLabelCount_) { 357 int32_t count = 0; 358 int32_t old = -1; 359 for (int32_t i = 0; i < indexCharacters.size();) { 360 ++count; 361 int32_t bump = count * maxLabelCount_ / size; 362 if (bump == old) { 363 indexCharacters.removeElementAt(i); 364 } else { 365 old = bump; 366 ++i; 367 } 368 } 369 } 370 } 371 372 namespace { 373 374 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 375 if (!current.startsWith(BASE, BASE_LENGTH)) { 376 return current; 377 } 378 UChar rest = current.charAt(BASE_LENGTH); 379 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 380 int32_t count = rest-0x2800; 381 temp.setTo((UChar)(0x30 + count % 10)); 382 if (count >= 10) { 383 count /= 10; 384 temp.insert(0, (UChar)(0x30 + count % 10)); 385 if (count >= 10) { 386 count /= 10; 387 temp.insert(0, (UChar)(0x30 + count)); 388 } 389 } 390 return temp.append((UChar)0x5283); 391 } 392 return temp.setTo(current, BASE_LENGTH); 393 } 394 395 UBool hasMultiplePrimaryWeights( 396 CollationElementIterator &cei, int32_t variableTop, 397 const UnicodeString &s, UErrorCode &errorCode) { 398 cei.setText(s, errorCode); 399 UBool seenPrimary = FALSE; 400 for (;;) { 401 int32_t ce32 = cei.next(errorCode); 402 if (ce32 == CollationElementIterator::NULLORDER) { 403 break; 404 } 405 int32_t p = CollationElementIterator::primaryOrder(ce32); 406 if (p > variableTop && (ce32 & 0xc0) != 0xc0) { 407 // not primary ignorable, and not a continuation CE 408 if (seenPrimary) { 409 return TRUE; 410 } 411 seenPrimary = TRUE; 412 } 413 } 414 return FALSE; 415 } 416 417 } // namespace 418 419 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 420 // Initialize indexCharacters. 421 UVector indexCharacters(errorCode); 422 indexCharacters.setDeleter(uprv_deleteUObject); 423 initLabels(indexCharacters, errorCode); 424 if (U_FAILURE(errorCode)) { return NULL; } 425 426 // Variables for hasMultiplePrimaryWeights(). 427 LocalPointer<CollationElementIterator> cei( 428 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); 429 if (cei.isNull()) { 430 errorCode = U_MEMORY_ALLOCATION_ERROR; 431 return NULL; 432 } 433 int32_t variableTop; 434 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 435 variableTop = CollationElementIterator::primaryOrder( 436 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); 437 } else { 438 variableTop = 0; 439 } 440 UBool hasInvisibleBuckets = FALSE; 441 442 // Helper arrays for Chinese Pinyin collation. 443 Bucket *asciiBuckets[26] = { 444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 446 }; 447 Bucket *pinyinBuckets[26] = { 448 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 449 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 450 }; 451 UBool hasPinyin = FALSE; 452 453 LocalPointer<UVector> bucketList(new UVector(errorCode)); 454 if (bucketList.isNull()) { 455 errorCode = U_MEMORY_ALLOCATION_ERROR; 456 return NULL; 457 } 458 bucketList->setDeleter(uprv_deleteUObject); 459 460 // underflow bucket 461 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 462 if (bucket == NULL) { 463 errorCode = U_MEMORY_ALLOCATION_ERROR; 464 return NULL; 465 } 466 bucketList->addElement(bucket, errorCode); 467 if (U_FAILURE(errorCode)) { return NULL; } 468 469 UnicodeString temp; 470 471 // fix up the list, adding underflow, additions, overflow 472 // Insert inflow labels as needed. 473 int32_t scriptIndex = -1; 474 const UnicodeString *scriptUpperBoundary = &emptyString_; 475 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 476 UnicodeString ¤t = *getString(indexCharacters, i); 477 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 478 // We crossed the script boundary into a new script. 479 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 480 UBool skippedScript = FALSE; 481 for (;;) { 482 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 483 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 484 break; 485 } 486 skippedScript = TRUE; 487 } 488 if (skippedScript && bucketList->size() > 1) { 489 // We are skipping one or more scripts, 490 // and we are not just getting out of the underflow label. 491 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 492 if (bucket == NULL) { 493 errorCode = U_MEMORY_ALLOCATION_ERROR; 494 return NULL; 495 } 496 bucketList->addElement(bucket, errorCode); 497 } 498 } 499 // Add a bucket with the current label. 500 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 501 if (bucket == NULL) { 502 errorCode = U_MEMORY_ALLOCATION_ERROR; 503 return NULL; 504 } 505 bucketList->addElement(bucket, errorCode); 506 // Remember ASCII and Pinyin buckets for Pinyin redirects. 507 UChar c; 508 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 509 asciiBuckets[c - 0x41] = bucket; 510 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 511 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 512 pinyinBuckets[c - 0x41] = bucket; 513 hasPinyin = TRUE; 514 } 515 // Check for multiple primary weights. 516 if (!current.startsWith(BASE, BASE_LENGTH) && 517 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && 518 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 519 // "AE-ligature" or "Sch" etc. 520 for (int32_t i = bucketList->size() - 2;; --i) { 521 Bucket *singleBucket = getBucket(*bucketList, i); 522 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 523 // There is no single-character bucket since the last 524 // underflow or inflow label. 525 break; 526 } 527 if (singleBucket->displayBucket_ == NULL && 528 !hasMultiplePrimaryWeights( 529 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { 530 // Add an invisible bucket that redirects strings greater than the expansion 531 // to the previous single-character bucket. 532 // For example, after ... Q R S Sch we add Sch\uFFFF->S 533 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 534 bucket = new Bucket(emptyString_, 535 UnicodeString(current).append((UChar)0xFFFF), 536 U_ALPHAINDEX_NORMAL); 537 if (bucket == NULL) { 538 errorCode = U_MEMORY_ALLOCATION_ERROR; 539 return NULL; 540 } 541 bucket->displayBucket_ = singleBucket; 542 bucketList->addElement(bucket, errorCode); 543 hasInvisibleBuckets = TRUE; 544 break; 545 } 546 } 547 } 548 } 549 if (U_FAILURE(errorCode)) { return NULL; } 550 if (bucketList->size() == 1) { 551 // No real labels, show only the underflow label. 552 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 553 if (bl == NULL) { 554 errorCode = U_MEMORY_ALLOCATION_ERROR; 555 return NULL; 556 } 557 bucketList.orphan(); 558 return bl; 559 } 560 // overflow bucket 561 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 562 if (bucket == NULL) { 563 errorCode = U_MEMORY_ALLOCATION_ERROR; 564 return NULL; 565 } 566 bucketList->addElement(bucket, errorCode); // final 567 568 if (hasPinyin) { 569 // Redirect Pinyin buckets. 570 Bucket *asciiBucket = NULL; 571 for (int32_t i = 0; i < 26; ++i) { 572 if (asciiBuckets[i] != NULL) { 573 asciiBucket = asciiBuckets[i]; 574 } 575 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 576 pinyinBuckets[i]->displayBucket_ = asciiBucket; 577 hasInvisibleBuckets = TRUE; 578 } 579 } 580 } 581 582 if (U_FAILURE(errorCode)) { return NULL; } 583 if (!hasInvisibleBuckets) { 584 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 585 if (bl == NULL) { 586 errorCode = U_MEMORY_ALLOCATION_ERROR; 587 return NULL; 588 } 589 bucketList.orphan(); 590 return bl; 591 } 592 // Merge inflow buckets that are visually adjacent. 593 // Iterate backwards: Merge inflow into overflow rather than the other way around. 594 int32_t i = bucketList->size() - 1; 595 Bucket *nextBucket = getBucket(*bucketList, i); 596 while (--i > 0) { 597 bucket = getBucket(*bucketList, i); 598 if (bucket->displayBucket_ != NULL) { 599 continue; // skip invisible buckets 600 } 601 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 602 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 603 bucket->displayBucket_ = nextBucket; 604 continue; 605 } 606 } 607 nextBucket = bucket; 608 } 609 610 LocalPointer<UVector> publicBucketList(new UVector(errorCode)); 611 if (bucketList.isNull()) { 612 errorCode = U_MEMORY_ALLOCATION_ERROR; 613 return NULL; 614 } 615 // Do not call publicBucketList->setDeleter(): 616 // This vector shares its objects with the bucketList. 617 for (int32_t i = 0; i < bucketList->size(); ++i) { 618 bucket = getBucket(*bucketList, i); 619 if (bucket->displayBucket_ == NULL) { 620 publicBucketList->addElement(bucket, errorCode); 621 } 622 } 623 if (U_FAILURE(errorCode)) { return NULL; } 624 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 625 if (bl == NULL) { 626 errorCode = U_MEMORY_ALLOCATION_ERROR; 627 return NULL; 628 } 629 bucketList.orphan(); 630 publicBucketList.orphan(); 631 return bl; 632 } 633 634 /** 635 * Creates an index, and buckets and sorts the list of records into the index. 636 */ 637 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 638 if (U_FAILURE(errorCode) || buckets_ != NULL) { 639 return; 640 } 641 buckets_ = createBucketList(errorCode); 642 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 643 return; 644 } 645 646 // Sort the records by name. 647 // Stable sort preserves input order of collation duplicates. 648 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 649 650 // Now, we traverse all of the input, which is now sorted. 651 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 652 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 653 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 654 // we need to improve it for that case. 655 656 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 657 int32_t bucketIndex = 1; 658 Bucket *nextBucket; 659 const UnicodeString *upperBoundary; 660 if (bucketIndex < buckets_->bucketList_->size()) { 661 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 662 upperBoundary = &nextBucket->lowerBoundary_; 663 } else { 664 nextBucket = NULL; 665 upperBoundary = NULL; 666 } 667 for (int32_t i = 0; i < inputList_->size(); ++i) { 668 Record *r = getRecord(*inputList_, i); 669 // if the current bucket isn't the right one, find the one that is 670 // We have a special flag for the last bucket so that we don't look any further 671 while (upperBoundary != NULL && 672 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 673 currentBucket = nextBucket; 674 // now reset the boundary that we compare against 675 if (bucketIndex < buckets_->bucketList_->size()) { 676 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 677 upperBoundary = &nextBucket->lowerBoundary_; 678 } else { 679 upperBoundary = NULL; 680 } 681 } 682 // now put the record into the bucket. 683 Bucket *bucket = currentBucket; 684 if (bucket->displayBucket_ != NULL) { 685 bucket = bucket->displayBucket_; 686 } 687 if (bucket->records_ == NULL) { 688 bucket->records_ = new UVector(errorCode); 689 if (bucket->records_ == NULL) { 690 errorCode = U_MEMORY_ALLOCATION_ERROR; 691 return; 692 } 693 } 694 bucket->records_->addElement(r, errorCode); 695 } 696 } 697 698 void AlphabeticIndex::clearBuckets() { 699 if (buckets_ != NULL) { 700 delete buckets_; 701 buckets_ = NULL; 702 internalResetBucketIterator(); 703 } 704 } 705 706 void AlphabeticIndex::internalResetBucketIterator() { 707 labelsIterIndex_ = -1; 708 currentBucket_ = NULL; 709 } 710 711 712 void AlphabeticIndex::addIndexExemplars(const Locale *locale, UErrorCode &status) { 713 if (U_FAILURE(status)) { return; } 714 // Chinese index characters, which are specific to each of the several Chinese tailorings, 715 // take precedence over the single locale data exemplar set per language. 716 const char *language = locale == NULL ? NULL : locale->getLanguage(); 717 if (language == NULL || 718 uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || 719 uprv_strcmp(language, "ko") == 0) { 720 // TODO: This should be done regardless of the language, but it's expensive. 721 // We should add a Collator function (can be @internal) 722 // to enumerate just the contractions that start with a given code point or string. 723 if (addChineseIndexCharacters(status) || U_FAILURE(status)) { 724 return; 725 } 726 } 727 if (locale == NULL) { return; } 728 729 LocalULocaleDataPointer uld(ulocdata_open(locale->getName(), &status)); 730 if (U_FAILURE(status)) { 731 return; 732 } 733 734 UnicodeSet exemplars; 735 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 736 if (U_SUCCESS(status)) { 737 initialLabels_->addAll(exemplars); 738 return; 739 } 740 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 741 742 // The locale data did not include explicit Index characters. 743 // Synthesize a set of them from the locale's standard exemplar characters. 744 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 745 if (U_FAILURE(status)) { 746 return; 747 } 748 749 // question: should we add auxiliary exemplars? 750 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 751 exemplars.add(0x61, 0x7A); 752 } 753 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 754 // cut down to small list 755 exemplars.remove(0xAC00, 0xD7A3). 756 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 757 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 758 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 759 add(0xD30C).add(0xD558); 760 } 761 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 762 // cut down to small list 763 // make use of the fact that Ethiopic is allocated in 8's, where 764 // the base is 0 mod 8. 765 UnicodeSet ethiopic( 766 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 767 UnicodeSetIterator it(ethiopic); 768 while (it.next() && !it.isString()) { 769 if ((it.getCodepoint() & 0x7) != 0) { 770 exemplars.remove(it.getCodepoint()); 771 } 772 } 773 } 774 775 // Upper-case any that aren't already so. 776 // (We only do this for synthesized index characters.) 777 UnicodeSetIterator it(exemplars); 778 UnicodeString upperC; 779 while (it.next()) { 780 const UnicodeString &exemplarC = it.getString(); 781 upperC = exemplarC; 782 upperC.toUpper(*locale); 783 initialLabels_->add(upperC); 784 } 785 } 786 787 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 788 UnicodeSet contractions; 789 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), 790 contractions.toUSet(), NULL, FALSE, &errorCode); 791 if (U_FAILURE(errorCode)) { return FALSE; } 792 UnicodeString firstHanBoundary; 793 UBool hasPinyin = FALSE; 794 UnicodeSetIterator iter(contractions); 795 while (iter.next()) { 796 const UnicodeString &s = iter.getString(); 797 if (s.startsWith(BASE, BASE_LENGTH)) { 798 initialLabels_->add(s); 799 if (firstHanBoundary.isEmpty() || 800 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { 801 firstHanBoundary = s; 802 } 803 UChar c = s.charAt(s.length() - 1); 804 if (0x41 <= c && c <= 0x5A) { // A-Z 805 hasPinyin = TRUE; 806 } 807 } 808 } 809 if (hasPinyin) { 810 initialLabels_->add(0x41, 0x5A); // A-Z 811 } 812 if (!firstHanBoundary.isEmpty()) { 813 // The hardcoded list of script boundaries includes U+4E00 814 // which is tailored to not be the first primary 815 // in all Chinese tailorings except "unihan". 816 // Replace U+4E00 with the first boundary string from the tailoring. 817 // TODO: This becomes obsolete when the root collator gets 818 // reliable script-first-primary mappings. 819 int32_t hanIndex = binarySearch( 820 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); 821 if (hanIndex >= 0) { 822 UnicodeString *fh = new UnicodeString(firstHanBoundary); 823 if (fh == NULL) { 824 errorCode = U_MEMORY_ALLOCATION_ERROR; 825 return FALSE; 826 } 827 firstCharsInScripts_->setElementAt(fh, hanIndex); 828 } 829 return TRUE; 830 } else { 831 return FALSE; 832 } 833 } 834 835 836 /* 837 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 838 */ 839 static const UChar CGJ = 0x034F; 840 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 841 UnicodeString result; 842 if (item.length() == 0) { 843 return result; 844 } 845 int32_t i = 0; 846 for (;;) { 847 UChar32 cp = item.char32At(i); 848 result.append(cp); 849 i = item.moveIndex32(i, 1); 850 if (i >= item.length()) { 851 break; 852 } 853 result.append(CGJ); 854 } 855 return result; 856 } 857 858 859 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 860 return FALSE; 861 } 862 863 864 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 865 return FALSE; 866 } 867 868 869 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 870 // There are no known non-RuleBasedCollator collators, and none ever expected. 871 // But, in case that changes, better a null pointer than a wrong type. 872 return *dynamic_cast<RuleBasedCollator *>(collator_); 873 } 874 875 876 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 877 return inflowLabel_; 878 } 879 880 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 881 return overflowLabel_; 882 } 883 884 885 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 886 return underflowLabel_; 887 } 888 889 890 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 891 inflowLabel_ = label; 892 clearBuckets(); 893 return *this; 894 } 895 896 897 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 898 overflowLabel_ = label; 899 clearBuckets(); 900 return *this; 901 } 902 903 904 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 905 underflowLabel_ = label; 906 clearBuckets(); 907 return *this; 908 } 909 910 911 int32_t AlphabeticIndex::getMaxLabelCount() const { 912 return maxLabelCount_; 913 } 914 915 916 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 917 if (U_FAILURE(status)) { 918 return *this; 919 } 920 if (maxLabelCount <= 0) { 921 status = U_ILLEGAL_ARGUMENT_ERROR; 922 return *this; 923 } 924 maxLabelCount_ = maxLabelCount; 925 clearBuckets(); 926 return *this; 927 } 928 929 930 // 931 // init() - Common code for constructors. 932 // 933 934 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 935 if (U_FAILURE(status)) { return; } 936 if (locale == NULL && collator_ == NULL) { 937 status = U_ILLEGAL_ARGUMENT_ERROR; 938 return; 939 } 940 941 initialLabels_ = new UnicodeSet(); 942 if (initialLabels_ == NULL) { 943 status = U_MEMORY_ALLOCATION_ERROR; 944 return; 945 } 946 947 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 948 overflowLabel_ = inflowLabel_; 949 underflowLabel_ = inflowLabel_; 950 951 if (collator_ == NULL) { 952 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); 953 if (U_FAILURE(status)) { return; } 954 if (collator_ == NULL) { 955 status = U_MEMORY_ALLOCATION_ERROR; 956 return; 957 } 958 } 959 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 960 if (collatorPrimaryOnly_ == NULL) { 961 status = U_MEMORY_ALLOCATION_ERROR; 962 return; 963 } 964 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 965 firstCharsInScripts_ = firstStringsInScript(status); 966 if (U_FAILURE(status)) { return; } 967 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 968 969 // Add index exemplar characters before checking the script boundaries, 970 // since this might modify them. 971 addIndexExemplars(locale, status); 972 973 UnicodeString _4E00((UChar)0x4E00); 974 int32_t hanIndex = binarySearch( 975 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); 976 if (hanIndex >= 0) { 977 // Adjust the Han script boundary if necessary. 978 // TODO: This becomes obsolete when the root collator gets 979 // reliable script-first-primary mappings. 980 UnicodeString _1100((UChar)0x1100); 981 UnicodeString _1112((UChar)0x1112); 982 UnicodeString _4E9C((UChar)0x4E9C); 983 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && 984 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { 985 // The standard Korean tailoring sorts Hanja (Han characters) 986 // as secondary differences from Hangul syllables. 987 // This makes U+4E00 not useful as a Han-script boundary. 988 firstCharsInScripts_->removeElementAt(hanIndex); 989 } else if (collatorPrimaryOnly_->compare(_4E9C, _4E00, status) < 0) { 990 // The standard Japanese tailoring sorts U+4E9C first among Kanji. 991 UnicodeString *fh = new UnicodeString(_4E9C); 992 if (fh == NULL) { 993 status = U_MEMORY_ALLOCATION_ERROR; 994 return; 995 } 996 firstCharsInScripts_->setElementAt(fh, hanIndex); 997 } 998 } 999 1000 // Guard against a degenerate collator where 1001 // some script boundary strings are primary ignorable. 1002 for (;;) { 1003 if (U_FAILURE(status)) { return; } 1004 if (firstCharsInScripts_->isEmpty()) { 1005 // AlphabeticIndex requires some non-ignorable script boundary strings. 1006 status = U_ILLEGAL_ARGUMENT_ERROR; 1007 return; 1008 } 1009 if (collatorPrimaryOnly_->compare( 1010 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 1011 emptyString_, status) == UCOL_EQUAL) { 1012 firstCharsInScripts_->removeElementAt(0); 1013 } else { 1014 break; 1015 } 1016 } 1017 } 1018 1019 1020 // 1021 // Comparison function for UVector<UnicodeString *> sorting with a collator. 1022 // 1023 static int32_t U_CALLCONV 1024 collatorComparator(const void *context, const void *left, const void *right) { 1025 const UElement *leftElement = static_cast<const UElement *>(left); 1026 const UElement *rightElement = static_cast<const UElement *>(right); 1027 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 1028 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 1029 1030 if (leftString == rightString) { 1031 // Catches case where both are NULL 1032 return 0; 1033 } 1034 if (leftString == NULL) { 1035 return 1; 1036 }; 1037 if (rightString == NULL) { 1038 return -1; 1039 } 1040 const Collator *col = static_cast<const Collator *>(context); 1041 UErrorCode errorCode = U_ZERO_ERROR; 1042 return col->compare(*leftString, *rightString, errorCode); 1043 } 1044 1045 // 1046 // Comparison function for UVector<Record *> sorting with a collator. 1047 // 1048 static int32_t U_CALLCONV 1049 recordCompareFn(const void *context, const void *left, const void *right) { 1050 const UElement *leftElement = static_cast<const UElement *>(left); 1051 const UElement *rightElement = static_cast<const UElement *>(right); 1052 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 1053 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 1054 const Collator *col = static_cast<const Collator *>(context); 1055 UErrorCode errorCode = U_ZERO_ERROR; 1056 return col->compare(leftRec->name_, rightRec->name_, errorCode); 1057 } 1058 1059 1060 /** 1061 * This list contains one character per script that has the 1062 * lowest primary weight for that script in the root collator. 1063 * This list will be copied and sorted to account for script reordering. 1064 * 1065 * <p>TODO: This is fragile. If the first character of a script is tailored 1066 * so that it does not map to the script's lowest primary weight any more, 1067 * then the buckets will be off. 1068 * There are hacks in the code to handle the known CJK tailorings of U+4E00. 1069 * 1070 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. 1071 * 1072 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in 1073 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java 1074 */ 1075 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 1076 0x41, 0, 0x03B1, 0, 1077 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, 1078 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, 1079 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 1080 0xAAF2, 0, // Meetei Mayek 1081 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 1082 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada 1083 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri 1084 0x1B83, 0, 1085 0xD802, 0xDE00, 0, 0x0E01, 0, 1086 0x0EDE, 0, // Lao 1087 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, 1088 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 1089 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma 1090 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, 1091 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, 1092 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 1093 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao 1094 0xD800, 0xDE80, 0, 1095 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, 1096 0xD801, 0xDC80, 0, 1097 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng 1098 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, 1099 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 1100 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive 1101 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs 1102 0x4E00, 0, 1103 // TODO: The overflow bucket's lowerBoundary string should be the 1104 // first item after the last reordering group in the collator's script order. 1105 // This should normally be the first Unicode code point 1106 // that is unassigned (U+0378 in Unicode 6.3) and untailored. 1107 // However, at least up to ICU 51 the Hani reordering group includes 1108 // unassigned code points, 1109 // and there is no stable string for the start of the trailing-weights range. 1110 // The only known string that sorts "high" is U+FFFF. 1111 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, 1112 // and fix relevant test code. 1113 // Ideally, FractionalUCA.txt will have a "script first primary" 1114 // for unassigned code points. 1115 0xFFFF, 0 1116 }; 1117 1118 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 1119 if (U_FAILURE(status)) { 1120 return NULL; 1121 } 1122 UVector *dest = new UVector(status); 1123 if (dest == NULL) { 1124 status = U_MEMORY_ALLOCATION_ERROR; 1125 return NULL; 1126 } 1127 dest->setDeleter(uprv_deleteUObject); 1128 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; 1129 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); 1130 do { 1131 if (U_FAILURE(status)) { 1132 return dest; 1133 } 1134 UnicodeString *str = new UnicodeString(src, -1); 1135 if (str == NULL) { 1136 status = U_MEMORY_ALLOCATION_ERROR; 1137 return dest; 1138 } 1139 dest->addElement(str, status); 1140 src += str->length() + 1; 1141 } while (src < limit); 1142 return dest; 1143 } 1144 1145 1146 namespace { 1147 1148 /** 1149 * Returns true if one index character string is "better" than the other. 1150 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1151 * better, and otherwise binary-less-than is better. 1152 */ 1153 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1154 const UnicodeString &one, const UnicodeString &other) { 1155 // This is called with primary-equal strings, but never with one.equals(other). 1156 UErrorCode status = U_ZERO_ERROR; 1157 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1158 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1159 if (U_FAILURE(status)) { return FALSE; } 1160 int32_t result = n1.countChar32() - n2.countChar32(); 1161 if (result != 0) { 1162 return result < 0; 1163 } 1164 result = n1.compareCodePointOrder(n2); 1165 if (result != 0) { 1166 return result < 0; 1167 } 1168 return one.compareCodePointOrder(other) < 0; 1169 } 1170 1171 } // namespace 1172 1173 // 1174 // Constructor & Destructor for AlphabeticIndex::Record 1175 // 1176 // Records are internal only, instances are not directly surfaced in the public API. 1177 // This class is mostly struct-like, with all public fields. 1178 1179 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1180 : name_(name), data_(data) {} 1181 1182 AlphabeticIndex::Record::~Record() { 1183 } 1184 1185 1186 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1187 if (U_FAILURE(status)) { 1188 return *this; 1189 } 1190 if (inputList_ == NULL) { 1191 inputList_ = new UVector(status); 1192 if (inputList_ == NULL) { 1193 status = U_MEMORY_ALLOCATION_ERROR; 1194 return *this; 1195 } 1196 inputList_->setDeleter(alphaIndex_deleteRecord); 1197 } 1198 Record *r = new Record(name, data); 1199 if (r == NULL) { 1200 status = U_MEMORY_ALLOCATION_ERROR; 1201 return *this; 1202 } 1203 inputList_->addElement(r, status); 1204 clearBuckets(); 1205 //std::string ss; 1206 //std::string ss2; 1207 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1208 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1209 return *this; 1210 } 1211 1212 1213 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1214 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1215 inputList_->removeAllElements(); 1216 clearBuckets(); 1217 } 1218 return *this; 1219 } 1220 1221 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1222 initBuckets(status); 1223 if (U_FAILURE(status)) { 1224 return 0; 1225 } 1226 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1227 } 1228 1229 1230 int32_t AlphabeticIndex::getBucketIndex() const { 1231 return labelsIterIndex_; 1232 } 1233 1234 1235 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1236 if (U_FAILURE(status)) { 1237 return FALSE; 1238 } 1239 if (buckets_ == NULL && currentBucket_ != NULL) { 1240 status = U_ENUM_OUT_OF_SYNC_ERROR; 1241 return FALSE; 1242 } 1243 initBuckets(status); 1244 if (U_FAILURE(status)) { 1245 return FALSE; 1246 } 1247 ++labelsIterIndex_; 1248 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1249 labelsIterIndex_ = buckets_->getBucketCount(); 1250 return FALSE; 1251 } 1252 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1253 resetRecordIterator(); 1254 return TRUE; 1255 } 1256 1257 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1258 if (currentBucket_ != NULL) { 1259 return currentBucket_->label_; 1260 } else { 1261 return emptyString_; 1262 } 1263 } 1264 1265 1266 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1267 if (currentBucket_ != NULL) { 1268 return currentBucket_->labelType_; 1269 } else { 1270 return U_ALPHAINDEX_NORMAL; 1271 } 1272 } 1273 1274 1275 int32_t AlphabeticIndex::getBucketRecordCount() const { 1276 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1277 return currentBucket_->records_->size(); 1278 } else { 1279 return 0; 1280 } 1281 } 1282 1283 1284 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1285 if (U_FAILURE(status)) { 1286 return *this; 1287 } 1288 internalResetBucketIterator(); 1289 return *this; 1290 } 1291 1292 1293 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1294 if (U_FAILURE(status)) { 1295 return FALSE; 1296 } 1297 if (currentBucket_ == NULL) { 1298 // We are trying to iterate over the items in a bucket, but there is no 1299 // current bucket from the enumeration of buckets. 1300 status = U_INVALID_STATE_ERROR; 1301 return FALSE; 1302 } 1303 if (buckets_ == NULL) { 1304 status = U_ENUM_OUT_OF_SYNC_ERROR; 1305 return FALSE; 1306 } 1307 if (currentBucket_->records_ == NULL) { 1308 return FALSE; 1309 } 1310 ++itemsIterIndex_; 1311 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1312 itemsIterIndex_ = currentBucket_->records_->size(); 1313 return FALSE; 1314 } 1315 return TRUE; 1316 } 1317 1318 1319 const UnicodeString &AlphabeticIndex::getRecordName() const { 1320 const UnicodeString *retStr = &emptyString_; 1321 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1322 itemsIterIndex_ >= 0 && 1323 itemsIterIndex_ < currentBucket_->records_->size()) { 1324 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1325 retStr = &item->name_; 1326 } 1327 return *retStr; 1328 } 1329 1330 const void *AlphabeticIndex::getRecordData() const { 1331 const void *retPtr = NULL; 1332 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1333 itemsIterIndex_ >= 0 && 1334 itemsIterIndex_ < currentBucket_->records_->size()) { 1335 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1336 retPtr = item->data_; 1337 } 1338 return retPtr; 1339 } 1340 1341 1342 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1343 itemsIterIndex_ = -1; 1344 return *this; 1345 } 1346 1347 1348 1349 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1350 const UnicodeString &lowerBoundary, 1351 UAlphabeticIndexLabelType type) 1352 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1353 displayBucket_(NULL), displayIndex_(-1), 1354 records_(NULL) { 1355 } 1356 1357 1358 AlphabeticIndex::Bucket::~Bucket() { 1359 delete records_; 1360 } 1361 1362 U_NAMESPACE_END 1363 1364 #endif 1365