1 /* 2 ******************************************************************************* 3 * Copyright (C) 2009-2013, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION 11 12 #include "unicode/alphaindex.h" 13 #include "unicode/coleitr.h" 14 #include "unicode/coll.h" 15 #include "unicode/localpointer.h" 16 #include "unicode/normalizer2.h" 17 #include "unicode/tblcoll.h" 18 #include "unicode/ulocdata.h" 19 #include "unicode/uniset.h" 20 #include "unicode/uobject.h" 21 #include "unicode/usetiter.h" 22 #include "unicode/utf16.h" 23 24 #include "cmemory.h" 25 #include "cstring.h" 26 #include "uassert.h" 27 #include "uvector.h" 28 29 //#include <string> 30 //#include <iostream> 31 32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 33 34 U_NAMESPACE_BEGIN 35 36 namespace { 37 38 /** 39 * Prefix string for Chinese index buckets. 40 * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes 41 */ 42 const UChar BASE[1] = { 0xFDD0 }; 43 const int32_t BASE_LENGTH = 1; 44 45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 46 const UnicodeString &one, const UnicodeString &other); 47 48 } // namespace 49 50 static int32_t U_CALLCONV 51 collatorComparator(const void *context, const void *left, const void *right); 52 53 static int32_t U_CALLCONV 54 recordCompareFn(const void *context, const void *left, const void *right); 55 56 // UVector<Record *> support function, delete a Record. 57 static void U_CALLCONV 58 alphaIndex_deleteRecord(void *obj) { 59 delete static_cast<AlphabeticIndex::Record *>(obj); 60 } 61 62 namespace { 63 64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned, 65 UErrorCode &errorCode) { 66 if (U_FAILURE(errorCode)) { return NULL; } 67 if (owned.isValid()) { 68 return owned.orphan(); 69 } 70 UnicodeString *p = new UnicodeString(s); 71 if (p == NULL) { 72 errorCode = U_MEMORY_ALLOCATION_ERROR; 73 } 74 return p; 75 } 76 77 inline UnicodeString *getString(const UVector &list, int32_t i) { 78 return static_cast<UnicodeString *>(list[i]); 79 } 80 81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) { 82 return static_cast<AlphabeticIndex::Bucket *>(list[i]); 83 } 84 85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) { 86 return static_cast<AlphabeticIndex::Record *>(list[i]); 87 } 88 89 /** 90 * Like Java Collections.binarySearch(List, String, Comparator). 91 * 92 * @return the index>=0 where the item was found, 93 * or the index<0 for inserting the string at ~index in sorted order 94 */ 95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) { 96 if (list.size() == 0) { return ~0; } 97 int32_t start = 0; 98 int32_t limit = list.size(); 99 for (;;) { 100 int32_t i = (start + limit) / 2; 101 const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i)); 102 UErrorCode errorCode = U_ZERO_ERROR; 103 UCollationResult cmp = coll.compare(s, *si, errorCode); 104 if (cmp == UCOL_EQUAL) { 105 return i; 106 } else if (cmp < 0) { 107 if (i == start) { 108 return ~start; // insert s before *si 109 } 110 limit = i; 111 } else { 112 if (i == start) { 113 return ~(start + 1); // insert s after *si 114 } 115 start = i; 116 } 117 } 118 } 119 120 } // namespace 121 122 // The BucketList is not in the anonymous namespace because only Clang 123 // seems to support its use in other classes from there. 124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library. 125 class BucketList : public UObject { 126 public: 127 BucketList(UVector *bucketList, UVector *publicBucketList) 128 : bucketList_(bucketList), immutableVisibleList_(publicBucketList) { 129 int32_t displayIndex = 0; 130 for (int32_t i = 0; i < publicBucketList->size(); ++i) { 131 getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++; 132 } 133 } 134 135 virtual ~BucketList() { 136 delete bucketList_; 137 if (immutableVisibleList_ != bucketList_) { 138 delete immutableVisibleList_; 139 } 140 } 141 142 int32_t getBucketCount() const { 143 return immutableVisibleList_->size(); 144 } 145 146 int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly, 147 UErrorCode &errorCode) { 148 // binary search 149 int32_t start = 0; 150 int32_t limit = bucketList_->size(); 151 while ((start + 1) < limit) { 152 int32_t i = (start + limit) / 2; 153 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i); 154 UCollationResult nameVsBucket = 155 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode); 156 if (nameVsBucket < 0) { 157 limit = i; 158 } else { 159 start = i; 160 } 161 } 162 const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start); 163 if (bucket->displayBucket_ != NULL) { 164 bucket = bucket->displayBucket_; 165 } 166 return bucket->displayIndex_; 167 } 168 169 /** All of the buckets, visible and invisible. */ 170 UVector *bucketList_; 171 /** Just the visible buckets. */ 172 UVector *immutableVisibleList_; 173 }; 174 175 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() { 176 delete buckets_; 177 delete collatorPrimaryOnly_; 178 } 179 180 int32_t 181 AlphabeticIndex::ImmutableIndex::getBucketCount() const { 182 return buckets_->getBucketCount(); 183 } 184 185 int32_t 186 AlphabeticIndex::ImmutableIndex::getBucketIndex( 187 const UnicodeString &name, UErrorCode &errorCode) const { 188 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode); 189 } 190 191 const AlphabeticIndex::Bucket * 192 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const { 193 if (0 <= index && index < buckets_->getBucketCount()) { 194 return icu::getBucket(*buckets_->immutableVisibleList_, index); 195 } else { 196 return NULL; 197 } 198 } 199 200 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status) 201 : inputList_(NULL), 202 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 203 maxLabelCount_(99), 204 initialLabels_(NULL), firstCharsInScripts_(NULL), 205 collator_(NULL), collatorPrimaryOnly_(NULL), 206 buckets_(NULL) { 207 init(&locale, status); 208 } 209 210 211 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status) 212 : inputList_(NULL), 213 labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL), 214 maxLabelCount_(99), 215 initialLabels_(NULL), firstCharsInScripts_(NULL), 216 collator_(collator), collatorPrimaryOnly_(NULL), 217 buckets_(NULL) { 218 init(NULL, status); 219 } 220 221 222 223 AlphabeticIndex::~AlphabeticIndex() { 224 delete collator_; 225 delete collatorPrimaryOnly_; 226 delete firstCharsInScripts_; 227 delete buckets_; 228 delete inputList_; 229 delete initialLabels_; 230 } 231 232 233 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) { 234 if (U_FAILURE(status)) { 235 return *this; 236 } 237 initialLabels_->addAll(additions); 238 clearBuckets(); 239 return *this; 240 } 241 242 243 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) { 244 addIndexExemplars(locale, status); 245 clearBuckets(); 246 return *this; 247 } 248 249 250 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) { 251 if (U_FAILURE(errorCode)) { return NULL; } 252 // In C++, the ImmutableIndex must own its copy of the BucketList, 253 // even if it contains no records, for proper memory management. 254 // We could clone the buckets_ if they are not NULL, 255 // but that would be worth it only if this method is called multiple times, 256 // or called after using the old-style bucket iterator API. 257 LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode)); 258 LocalPointer<RuleBasedCollator> coll( 259 static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone())); 260 if (immutableBucketList.isNull() || coll.isNull()) { 261 errorCode = U_MEMORY_ALLOCATION_ERROR; 262 return NULL; 263 } 264 ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias()); 265 if (immIndex == NULL) { 266 errorCode = U_MEMORY_ALLOCATION_ERROR; 267 return NULL; 268 } 269 // The ImmutableIndex adopted its parameter objects. 270 immutableBucketList.orphan(); 271 coll.orphan(); 272 return immIndex; 273 } 274 275 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) { 276 initBuckets(status); 277 if (U_FAILURE(status)) { 278 return 0; 279 } 280 return buckets_->getBucketCount(); 281 } 282 283 284 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) { 285 if (U_FAILURE(status) || inputList_ == NULL) { 286 return 0; 287 } 288 return inputList_->size(); 289 } 290 291 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const { 292 const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode); 293 if (U_FAILURE(errorCode)) { return; } 294 295 const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0); 296 const UnicodeString &overflowBoundary = 297 *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1); 298 299 // We make a sorted array of elements. 300 // Some of the input may be redundant. 301 // That is, we might have c, ch, d, where "ch" sorts just like "c", "h". 302 // We filter out those cases. 303 UnicodeSetIterator iter(*initialLabels_); 304 while (iter.next()) { 305 const UnicodeString *item = &iter.getString(); 306 LocalPointer<UnicodeString> ownedItem; 307 UBool checkDistinct; 308 int32_t itemLength = item->length(); 309 if (!item->hasMoreChar32Than(0, itemLength, 1)) { 310 checkDistinct = FALSE; 311 } else if(item->charAt(itemLength - 1) == 0x2a && // '*' 312 item->charAt(itemLength - 2) != 0x2a) { 313 // Use a label if it is marked with one trailing star, 314 // even if the label string sorts the same when all contractions are suppressed. 315 ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1)); 316 item = ownedItem.getAlias(); 317 if (item == NULL) { 318 errorCode = U_MEMORY_ALLOCATION_ERROR; 319 return; 320 } 321 checkDistinct = FALSE; 322 } else { 323 checkDistinct = TRUE; 324 } 325 if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) { 326 // Ignore a primary-ignorable or non-alphabetic index character. 327 } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) { 328 // Ignore an index characters that will land in the overflow bucket. 329 } else if (checkDistinct && 330 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) { 331 // Ignore a multi-code point index character that does not sort distinctly 332 // from the sequence of its separate characters. 333 } else { 334 int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_); 335 if (insertionPoint < 0) { 336 indexCharacters.insertElementAt( 337 ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode); 338 } else { 339 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint); 340 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) { 341 indexCharacters.setElementAt( 342 ownedString(*item, ownedItem, errorCode), insertionPoint); 343 } 344 } 345 } 346 } 347 if (U_FAILURE(errorCode)) { return; } 348 349 // if the result is still too large, cut down to maxCount elements, by removing every nth element 350 351 int32_t size = indexCharacters.size() - 1; 352 if (size > maxLabelCount_) { 353 int32_t count = 0; 354 int32_t old = -1; 355 for (int32_t i = 0; i < indexCharacters.size();) { 356 ++count; 357 int32_t bump = count * maxLabelCount_ / size; 358 if (bump == old) { 359 indexCharacters.removeElementAt(i); 360 } else { 361 old = bump; 362 ++i; 363 } 364 } 365 } 366 } 367 368 namespace { 369 370 const UnicodeString &fixLabel(const UnicodeString ¤t, UnicodeString &temp) { 371 if (!current.startsWith(BASE, BASE_LENGTH)) { 372 return current; 373 } 374 UChar rest = current.charAt(BASE_LENGTH); 375 if (0x2800 < rest && rest <= 0x28FF) { // stroke count 376 int32_t count = rest-0x2800; 377 temp.setTo((UChar)(0x30 + count % 10)); 378 if (count >= 10) { 379 count /= 10; 380 temp.insert(0, (UChar)(0x30 + count % 10)); 381 if (count >= 10) { 382 count /= 10; 383 temp.insert(0, (UChar)(0x30 + count)); 384 } 385 } 386 return temp.append((UChar)0x5283); 387 } 388 return temp.setTo(current, BASE_LENGTH); 389 } 390 391 UBool hasMultiplePrimaryWeights( 392 CollationElementIterator &cei, int32_t variableTop, 393 const UnicodeString &s, UErrorCode &errorCode) { 394 cei.setText(s, errorCode); 395 UBool seenPrimary = FALSE; 396 for (;;) { 397 int32_t ce32 = cei.next(errorCode); 398 if (ce32 == CollationElementIterator::NULLORDER) { 399 break; 400 } 401 int32_t p = CollationElementIterator::primaryOrder(ce32); 402 if (p > variableTop && (ce32 & 0xc0) != 0xc0) { 403 // not primary ignorable, and not a continuation CE 404 if (seenPrimary) { 405 return TRUE; 406 } 407 seenPrimary = TRUE; 408 } 409 } 410 return FALSE; 411 } 412 413 } // namespace 414 415 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const { 416 // Initialize indexCharacters. 417 UVector indexCharacters(errorCode); 418 indexCharacters.setDeleter(uprv_deleteUObject); 419 initLabels(indexCharacters, errorCode); 420 if (U_FAILURE(errorCode)) { return NULL; } 421 422 // Variables for hasMultiplePrimaryWeights(). 423 LocalPointer<CollationElementIterator> cei( 424 collatorPrimaryOnly_->createCollationElementIterator(emptyString_)); 425 if (cei.isNull()) { 426 errorCode = U_MEMORY_ALLOCATION_ERROR; 427 return NULL; 428 } 429 int32_t variableTop; 430 if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) { 431 variableTop = CollationElementIterator::primaryOrder( 432 (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode)); 433 } else { 434 variableTop = 0; 435 } 436 UBool hasInvisibleBuckets = FALSE; 437 438 // Helper arrays for Chinese Pinyin collation. 439 Bucket *asciiBuckets[26] = { 440 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 441 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 442 }; 443 Bucket *pinyinBuckets[26] = { 444 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 445 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL 446 }; 447 UBool hasPinyin = FALSE; 448 449 LocalPointer<UVector> bucketList(new UVector(errorCode)); 450 if (bucketList.isNull()) { 451 errorCode = U_MEMORY_ALLOCATION_ERROR; 452 return NULL; 453 } 454 bucketList->setDeleter(uprv_deleteUObject); 455 456 // underflow bucket 457 Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW); 458 if (bucket == NULL) { 459 errorCode = U_MEMORY_ALLOCATION_ERROR; 460 return NULL; 461 } 462 bucketList->addElement(bucket, errorCode); 463 if (U_FAILURE(errorCode)) { return NULL; } 464 465 UnicodeString temp; 466 467 // fix up the list, adding underflow, additions, overflow 468 // Insert inflow labels as needed. 469 int32_t scriptIndex = -1; 470 const UnicodeString *scriptUpperBoundary = &emptyString_; 471 for (int32_t i = 0; i < indexCharacters.size(); ++i) { 472 UnicodeString ¤t = *getString(indexCharacters, i); 473 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) { 474 // We crossed the script boundary into a new script. 475 const UnicodeString &inflowBoundary = *scriptUpperBoundary; 476 UBool skippedScript = FALSE; 477 for (;;) { 478 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex); 479 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) { 480 break; 481 } 482 skippedScript = TRUE; 483 } 484 if (skippedScript && bucketList->size() > 1) { 485 // We are skipping one or more scripts, 486 // and we are not just getting out of the underflow label. 487 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW); 488 if (bucket == NULL) { 489 errorCode = U_MEMORY_ALLOCATION_ERROR; 490 return NULL; 491 } 492 bucketList->addElement(bucket, errorCode); 493 } 494 } 495 // Add a bucket with the current label. 496 bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL); 497 if (bucket == NULL) { 498 errorCode = U_MEMORY_ALLOCATION_ERROR; 499 return NULL; 500 } 501 bucketList->addElement(bucket, errorCode); 502 // Remember ASCII and Pinyin buckets for Pinyin redirects. 503 UChar c; 504 if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) { // A-Z 505 asciiBuckets[c - 0x41] = bucket; 506 } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) && 507 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) { 508 pinyinBuckets[c - 0x41] = bucket; 509 hasPinyin = TRUE; 510 } 511 // Check for multiple primary weights. 512 if (!current.startsWith(BASE, BASE_LENGTH) && 513 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) && 514 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) { 515 // "AE-ligature" or "Sch" etc. 516 for (int32_t i = bucketList->size() - 2;; --i) { 517 Bucket *singleBucket = getBucket(*bucketList, i); 518 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 519 // There is no single-character bucket since the last 520 // underflow or inflow label. 521 break; 522 } 523 if (singleBucket->displayBucket_ == NULL && 524 !hasMultiplePrimaryWeights( 525 *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) { 526 // Add an invisible bucket that redirects strings greater than the expansion 527 // to the previous single-character bucket. 528 // For example, after ... Q R S Sch we add Sch\uFFFF->S 529 // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S. 530 bucket = new Bucket(emptyString_, 531 UnicodeString(current).append((UChar)0xFFFF), 532 U_ALPHAINDEX_NORMAL); 533 if (bucket == NULL) { 534 errorCode = U_MEMORY_ALLOCATION_ERROR; 535 return NULL; 536 } 537 bucket->displayBucket_ = singleBucket; 538 bucketList->addElement(bucket, errorCode); 539 hasInvisibleBuckets = TRUE; 540 break; 541 } 542 } 543 } 544 } 545 if (U_FAILURE(errorCode)) { return NULL; } 546 if (bucketList->size() == 1) { 547 // No real labels, show only the underflow label. 548 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 549 if (bl == NULL) { 550 errorCode = U_MEMORY_ALLOCATION_ERROR; 551 return NULL; 552 } 553 bucketList.orphan(); 554 return bl; 555 } 556 // overflow bucket 557 bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW); 558 if (bucket == NULL) { 559 errorCode = U_MEMORY_ALLOCATION_ERROR; 560 return NULL; 561 } 562 bucketList->addElement(bucket, errorCode); // final 563 564 if (hasPinyin) { 565 // Redirect Pinyin buckets. 566 Bucket *asciiBucket = NULL; 567 for (int32_t i = 0; i < 26; ++i) { 568 if (asciiBuckets[i] != NULL) { 569 asciiBucket = asciiBuckets[i]; 570 } 571 if (pinyinBuckets[i] != NULL && asciiBucket != NULL) { 572 pinyinBuckets[i]->displayBucket_ = asciiBucket; 573 hasInvisibleBuckets = TRUE; 574 } 575 } 576 } 577 578 if (U_FAILURE(errorCode)) { return NULL; } 579 if (!hasInvisibleBuckets) { 580 BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias()); 581 if (bl == NULL) { 582 errorCode = U_MEMORY_ALLOCATION_ERROR; 583 return NULL; 584 } 585 bucketList.orphan(); 586 return bl; 587 } 588 // Merge inflow buckets that are visually adjacent. 589 // Iterate backwards: Merge inflow into overflow rather than the other way around. 590 int32_t i = bucketList->size() - 1; 591 Bucket *nextBucket = getBucket(*bucketList, i); 592 while (--i > 0) { 593 bucket = getBucket(*bucketList, i); 594 if (bucket->displayBucket_ != NULL) { 595 continue; // skip invisible buckets 596 } 597 if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) { 598 if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) { 599 bucket->displayBucket_ = nextBucket; 600 continue; 601 } 602 } 603 nextBucket = bucket; 604 } 605 606 LocalPointer<UVector> publicBucketList(new UVector(errorCode)); 607 if (bucketList.isNull()) { 608 errorCode = U_MEMORY_ALLOCATION_ERROR; 609 return NULL; 610 } 611 // Do not call publicBucketList->setDeleter(): 612 // This vector shares its objects with the bucketList. 613 for (int32_t i = 0; i < bucketList->size(); ++i) { 614 bucket = getBucket(*bucketList, i); 615 if (bucket->displayBucket_ == NULL) { 616 publicBucketList->addElement(bucket, errorCode); 617 } 618 } 619 if (U_FAILURE(errorCode)) { return NULL; } 620 BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias()); 621 if (bl == NULL) { 622 errorCode = U_MEMORY_ALLOCATION_ERROR; 623 return NULL; 624 } 625 bucketList.orphan(); 626 publicBucketList.orphan(); 627 return bl; 628 } 629 630 /** 631 * Creates an index, and buckets and sorts the list of records into the index. 632 */ 633 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) { 634 if (U_FAILURE(errorCode) || buckets_ != NULL) { 635 return; 636 } 637 buckets_ = createBucketList(errorCode); 638 if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) { 639 return; 640 } 641 642 // Sort the records by name. 643 // Stable sort preserves input order of collation duplicates. 644 inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode); 645 646 // Now, we traverse all of the input, which is now sorted. 647 // If the item doesn't go in the current bucket, we find the next bucket that contains it. 648 // This makes the process order n*log(n), since we just sort the list and then do a linear process. 649 // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so 650 // we need to improve it for that case. 651 652 Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0); 653 int32_t bucketIndex = 1; 654 Bucket *nextBucket; 655 const UnicodeString *upperBoundary; 656 if (bucketIndex < buckets_->bucketList_->size()) { 657 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 658 upperBoundary = &nextBucket->lowerBoundary_; 659 } else { 660 nextBucket = NULL; 661 upperBoundary = NULL; 662 } 663 for (int32_t i = 0; i < inputList_->size(); ++i) { 664 Record *r = getRecord(*inputList_, i); 665 // if the current bucket isn't the right one, find the one that is 666 // We have a special flag for the last bucket so that we don't look any further 667 while (upperBoundary != NULL && 668 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) { 669 currentBucket = nextBucket; 670 // now reset the boundary that we compare against 671 if (bucketIndex < buckets_->bucketList_->size()) { 672 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++); 673 upperBoundary = &nextBucket->lowerBoundary_; 674 } else { 675 upperBoundary = NULL; 676 } 677 } 678 // now put the record into the bucket. 679 Bucket *bucket = currentBucket; 680 if (bucket->displayBucket_ != NULL) { 681 bucket = bucket->displayBucket_; 682 } 683 if (bucket->records_ == NULL) { 684 bucket->records_ = new UVector(errorCode); 685 if (bucket->records_ == NULL) { 686 errorCode = U_MEMORY_ALLOCATION_ERROR; 687 return; 688 } 689 } 690 bucket->records_->addElement(r, errorCode); 691 } 692 } 693 694 void AlphabeticIndex::clearBuckets() { 695 if (buckets_ != NULL) { 696 delete buckets_; 697 buckets_ = NULL; 698 internalResetBucketIterator(); 699 } 700 } 701 702 void AlphabeticIndex::internalResetBucketIterator() { 703 labelsIterIndex_ = -1; 704 currentBucket_ = NULL; 705 } 706 707 708 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) { 709 if (U_FAILURE(status)) { return; } 710 // Chinese index characters, which are specific to each of the several Chinese tailorings, 711 // take precedence over the single locale data exemplar set per language. 712 const char *language = locale.getLanguage(); 713 if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 || 714 uprv_strcmp(language, "ko") == 0) { 715 // TODO: This should be done regardless of the language, but it's expensive. 716 // We should add a Collator function (can be @internal) 717 // to enumerate just the contractions that start with a given code point or string. 718 if (addChineseIndexCharacters(status) || U_FAILURE(status)) { 719 return; 720 } 721 } 722 723 LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status)); 724 if (U_FAILURE(status)) { 725 return; 726 } 727 728 UnicodeSet exemplars; 729 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status); 730 if (U_SUCCESS(status)) { 731 initialLabels_->addAll(exemplars); 732 return; 733 } 734 status = U_ZERO_ERROR; // Clear out U_MISSING_RESOURCE_ERROR 735 736 // The locale data did not include explicit Index characters. 737 // Synthesize a set of them from the locale's standard exemplar characters. 738 ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status); 739 if (U_FAILURE(status)) { 740 return; 741 } 742 743 // question: should we add auxiliary exemplars? 744 if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) { 745 exemplars.add(0x61, 0x7A); 746 } 747 if (exemplars.containsSome(0xAC00, 0xD7A3)) { // Hangul syllables 748 // cut down to small list 749 exemplars.remove(0xAC00, 0xD7A3). 750 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C). 751 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544). 752 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0). 753 add(0xD30C).add(0xD558); 754 } 755 if (exemplars.containsSome(0x1200, 0x137F)) { // Ethiopic block 756 // cut down to small list 757 // make use of the fact that Ethiopic is allocated in 8's, where 758 // the base is 0 mod 8. 759 UnicodeSet ethiopic( 760 UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status); 761 UnicodeSetIterator it(ethiopic); 762 while (it.next() && !it.isString()) { 763 if ((it.getCodepoint() & 0x7) != 0) { 764 exemplars.remove(it.getCodepoint()); 765 } 766 } 767 } 768 769 // Upper-case any that aren't already so. 770 // (We only do this for synthesized index characters.) 771 UnicodeSetIterator it(exemplars); 772 UnicodeString upperC; 773 while (it.next()) { 774 const UnicodeString &exemplarC = it.getString(); 775 upperC = exemplarC; 776 upperC.toUpper(locale); 777 initialLabels_->add(upperC); 778 } 779 } 780 781 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) { 782 UnicodeSet contractions; 783 ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(), 784 contractions.toUSet(), NULL, FALSE, &errorCode); 785 if (U_FAILURE(errorCode)) { return FALSE; } 786 UnicodeString firstHanBoundary; 787 UBool hasPinyin = FALSE; 788 UnicodeSetIterator iter(contractions); 789 while (iter.next()) { 790 const UnicodeString &s = iter.getString(); 791 if (s.startsWith(BASE, BASE_LENGTH)) { 792 initialLabels_->add(s); 793 if (firstHanBoundary.isEmpty() || 794 collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) { 795 firstHanBoundary = s; 796 } 797 UChar c = s.charAt(s.length() - 1); 798 if (0x41 <= c && c <= 0x5A) { // A-Z 799 hasPinyin = TRUE; 800 } 801 } 802 } 803 if (hasPinyin) { 804 initialLabels_->add(0x41, 0x5A); // A-Z 805 } 806 if (!firstHanBoundary.isEmpty()) { 807 // The hardcoded list of script boundaries includes U+4E00 808 // which is tailored to not be the first primary 809 // in all Chinese tailorings except "unihan". 810 // Replace U+4E00 with the first boundary string from the tailoring. 811 // TODO: This becomes obsolete when the root collator gets 812 // reliable script-first-primary mappings. 813 int32_t hanIndex = binarySearch( 814 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_); 815 if (hanIndex >= 0) { 816 UnicodeString *fh = new UnicodeString(firstHanBoundary); 817 if (fh == NULL) { 818 errorCode = U_MEMORY_ALLOCATION_ERROR; 819 return FALSE; 820 } 821 firstCharsInScripts_->setElementAt(fh, hanIndex); 822 } 823 return TRUE; 824 } else { 825 return FALSE; 826 } 827 } 828 829 830 /* 831 * Return the string with interspersed CGJs. Input must have more than 2 codepoints. 832 */ 833 static const UChar CGJ = 0x034F; 834 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) { 835 UnicodeString result; 836 if (item.length() == 0) { 837 return result; 838 } 839 int32_t i = 0; 840 for (;;) { 841 UChar32 cp = item.char32At(i); 842 result.append(cp); 843 i = item.moveIndex32(i, 1); 844 if (i >= item.length()) { 845 break; 846 } 847 result.append(CGJ); 848 } 849 return result; 850 } 851 852 853 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const { 854 return FALSE; 855 } 856 857 858 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const { 859 return FALSE; 860 } 861 862 863 const RuleBasedCollator &AlphabeticIndex::getCollator() const { 864 // There are no known non-RuleBasedCollator collators, and none ever expected. 865 // But, in case that changes, better a null pointer than a wrong type. 866 return *dynamic_cast<RuleBasedCollator *>(collator_); 867 } 868 869 870 const UnicodeString &AlphabeticIndex::getInflowLabel() const { 871 return inflowLabel_; 872 } 873 874 const UnicodeString &AlphabeticIndex::getOverflowLabel() const { 875 return overflowLabel_; 876 } 877 878 879 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const { 880 return underflowLabel_; 881 } 882 883 884 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 885 inflowLabel_ = label; 886 clearBuckets(); 887 return *this; 888 } 889 890 891 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 892 overflowLabel_ = label; 893 clearBuckets(); 894 return *this; 895 } 896 897 898 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) { 899 underflowLabel_ = label; 900 clearBuckets(); 901 return *this; 902 } 903 904 905 int32_t AlphabeticIndex::getMaxLabelCount() const { 906 return maxLabelCount_; 907 } 908 909 910 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) { 911 if (U_FAILURE(status)) { 912 return *this; 913 } 914 if (maxLabelCount <= 0) { 915 status = U_ILLEGAL_ARGUMENT_ERROR; 916 return *this; 917 } 918 maxLabelCount_ = maxLabelCount; 919 clearBuckets(); 920 return *this; 921 } 922 923 924 // 925 // init() - Common code for constructors. 926 // 927 928 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) { 929 if (U_FAILURE(status)) { return; } 930 if (locale == NULL && collator_ == NULL) { 931 status = U_ILLEGAL_ARGUMENT_ERROR; 932 return; 933 } 934 935 initialLabels_ = new UnicodeSet(); 936 if (initialLabels_ == NULL) { 937 status = U_MEMORY_ALLOCATION_ERROR; 938 return; 939 } 940 941 inflowLabel_.setTo((UChar)0x2026); // Ellipsis 942 overflowLabel_ = inflowLabel_; 943 underflowLabel_ = inflowLabel_; 944 945 if (collator_ == NULL) { 946 collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status)); 947 if (U_FAILURE(status)) { return; } 948 if (collator_ == NULL) { 949 status = U_MEMORY_ALLOCATION_ERROR; 950 return; 951 } 952 } 953 collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone()); 954 if (collatorPrimaryOnly_ == NULL) { 955 status = U_MEMORY_ALLOCATION_ERROR; 956 return; 957 } 958 collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status); 959 firstCharsInScripts_ = firstStringsInScript(status); 960 if (U_FAILURE(status)) { return; } 961 firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status); 962 UnicodeString _4E00((UChar)0x4E00); 963 UnicodeString _1100((UChar)0x1100); 964 UnicodeString _1112((UChar)0x1112); 965 if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 && 966 collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) { 967 // The standard Korean tailoring sorts Hanja (Han characters) 968 // as secondary differences from Hangul syllables. 969 // This makes U+4E00 not useful as a Han-script boundary. 970 // TODO: This becomes obsolete when the root collator gets 971 // reliable script-first-primary mappings. 972 int32_t hanIndex = binarySearch( 973 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_); 974 if (hanIndex >= 0) { 975 firstCharsInScripts_->removeElementAt(hanIndex); 976 } 977 } 978 // Guard against a degenerate collator where 979 // some script boundary strings are primary ignorable. 980 for (;;) { 981 if (U_FAILURE(status)) { return; } 982 if (firstCharsInScripts_->isEmpty()) { 983 // AlphabeticIndex requires some non-ignorable script boundary strings. 984 status = U_ILLEGAL_ARGUMENT_ERROR; 985 return; 986 } 987 if (collatorPrimaryOnly_->compare( 988 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)), 989 emptyString_, status) == UCOL_EQUAL) { 990 firstCharsInScripts_->removeElementAt(0); 991 } else { 992 break; 993 } 994 } 995 996 if (locale != NULL) { 997 addIndexExemplars(*locale, status); 998 } 999 } 1000 1001 1002 // 1003 // Comparison function for UVector<UnicodeString *> sorting with a collator. 1004 // 1005 static int32_t U_CALLCONV 1006 collatorComparator(const void *context, const void *left, const void *right) { 1007 const UElement *leftElement = static_cast<const UElement *>(left); 1008 const UElement *rightElement = static_cast<const UElement *>(right); 1009 const UnicodeString *leftString = static_cast<const UnicodeString *>(leftElement->pointer); 1010 const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer); 1011 1012 if (leftString == rightString) { 1013 // Catches case where both are NULL 1014 return 0; 1015 } 1016 if (leftString == NULL) { 1017 return 1; 1018 }; 1019 if (rightString == NULL) { 1020 return -1; 1021 } 1022 const Collator *col = static_cast<const Collator *>(context); 1023 UErrorCode errorCode = U_ZERO_ERROR; 1024 return col->compare(*leftString, *rightString, errorCode); 1025 } 1026 1027 // 1028 // Comparison function for UVector<Record *> sorting with a collator. 1029 // 1030 static int32_t U_CALLCONV 1031 recordCompareFn(const void *context, const void *left, const void *right) { 1032 const UElement *leftElement = static_cast<const UElement *>(left); 1033 const UElement *rightElement = static_cast<const UElement *>(right); 1034 const AlphabeticIndex::Record *leftRec = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer); 1035 const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer); 1036 const Collator *col = static_cast<const Collator *>(context); 1037 UErrorCode errorCode = U_ZERO_ERROR; 1038 return col->compare(leftRec->name_, rightRec->name_, errorCode); 1039 } 1040 1041 1042 /** 1043 * This list contains one character per script that has the 1044 * lowest primary weight for that script in the root collator. 1045 * This list will be copied and sorted to account for script reordering. 1046 * 1047 * <p>TODO: This is fragile. If the first character of a script is tailored 1048 * so that it does not map to the script's lowest primary weight any more, 1049 * then the buckets will be off. 1050 * There are hacks in the code to handle the known CJK tailorings of U+4E00. 1051 * 1052 * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a. 1053 * 1054 * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in 1055 * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java 1056 */ 1057 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] = { 1058 0x41, 0, 0x03B1, 0, 1059 0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0, 1060 0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0, 1061 0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0, 1062 0xAAF2, 0, // Meetei Mayek 1063 0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0, 1064 U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0, // Sharada 1065 U16_LEAD(0x11680), U16_TRAIL(0x11680), 0, // Takri 1066 0x1B83, 0, 1067 0xD802, 0xDE00, 0, 0x0E01, 0, 1068 0x0EDE, 0, // Lao 1069 0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0, 1070 0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0, 1071 U16_LEAD(0x11103), U16_TRAIL(0x11103), 0, // Chakma 1072 0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0, 1073 0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0, 1074 0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0, 1075 U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0, // Miao 1076 0xD800, 0xDE80, 0, 1077 0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0, 1078 0xD801, 0xDC80, 0, 1079 U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0, // Sora Sompeng 1080 0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0, 1081 0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0, 1082 U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0, // Meroitic Cursive 1083 U16_LEAD(0x10980), U16_TRAIL(0x10980), 0, // Meroitic Hieroglyphs 1084 0x4E00, 0, 1085 // TODO: The overflow bucket's lowerBoundary string should be the 1086 // first item after the last reordering group in the collator's script order. 1087 // This should normally be the first Unicode code point 1088 // that is unassigned (U+0378 in Unicode 6.3) and untailored. 1089 // However, at least up to ICU 51 the Hani reordering group includes 1090 // unassigned code points, 1091 // and there is no stable string for the start of the trailing-weights range. 1092 // The only known string that sorts "high" is U+FFFF. 1093 // When ICU separates Hani vs. unassigned reordering groups, we need to fix this, 1094 // and fix relevant test code. 1095 // Ideally, FractionalUCA.txt will have a "script first primary" 1096 // for unassigned code points. 1097 0xFFFF, 0 1098 }; 1099 1100 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) { 1101 if (U_FAILURE(status)) { 1102 return NULL; 1103 } 1104 UVector *dest = new UVector(status); 1105 if (dest == NULL) { 1106 status = U_MEMORY_ALLOCATION_ERROR; 1107 return NULL; 1108 } 1109 dest->setDeleter(uprv_deleteUObject); 1110 const UChar *src = HACK_FIRST_CHARS_IN_SCRIPTS; 1111 const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS); 1112 do { 1113 if (U_FAILURE(status)) { 1114 return dest; 1115 } 1116 UnicodeString *str = new UnicodeString(src, -1); 1117 if (str == NULL) { 1118 status = U_MEMORY_ALLOCATION_ERROR; 1119 return dest; 1120 } 1121 dest->addElement(str, status); 1122 src += str->length() + 1; 1123 } while (src < limit); 1124 return dest; 1125 } 1126 1127 1128 namespace { 1129 1130 /** 1131 * Returns true if one index character string is "better" than the other. 1132 * Shorter NFKD is better, and otherwise NFKD-binary-less-than is 1133 * better, and otherwise binary-less-than is better. 1134 */ 1135 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer, 1136 const UnicodeString &one, const UnicodeString &other) { 1137 // This is called with primary-equal strings, but never with one.equals(other). 1138 UErrorCode status = U_ZERO_ERROR; 1139 UnicodeString n1 = nfkdNormalizer.normalize(one, status); 1140 UnicodeString n2 = nfkdNormalizer.normalize(other, status); 1141 if (U_FAILURE(status)) { return FALSE; } 1142 int32_t result = n1.countChar32() - n2.countChar32(); 1143 if (result != 0) { 1144 return result < 0; 1145 } 1146 result = n1.compareCodePointOrder(n2); 1147 if (result != 0) { 1148 return result < 0; 1149 } 1150 return one.compareCodePointOrder(other) < 0; 1151 } 1152 1153 } // namespace 1154 1155 // 1156 // Constructor & Destructor for AlphabeticIndex::Record 1157 // 1158 // Records are internal only, instances are not directly surfaced in the public API. 1159 // This class is mostly struct-like, with all public fields. 1160 1161 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data) 1162 : name_(name), data_(data) {} 1163 1164 AlphabeticIndex::Record::~Record() { 1165 } 1166 1167 1168 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) { 1169 if (U_FAILURE(status)) { 1170 return *this; 1171 } 1172 if (inputList_ == NULL) { 1173 inputList_ = new UVector(status); 1174 if (inputList_ == NULL) { 1175 status = U_MEMORY_ALLOCATION_ERROR; 1176 return *this; 1177 } 1178 inputList_->setDeleter(alphaIndex_deleteRecord); 1179 } 1180 Record *r = new Record(name, data); 1181 if (r == NULL) { 1182 status = U_MEMORY_ALLOCATION_ERROR; 1183 return *this; 1184 } 1185 inputList_->addElement(r, status); 1186 clearBuckets(); 1187 //std::string ss; 1188 //std::string ss2; 1189 //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" << 1190 // " sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl; 1191 return *this; 1192 } 1193 1194 1195 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) { 1196 if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) { 1197 inputList_->removeAllElements(); 1198 clearBuckets(); 1199 } 1200 return *this; 1201 } 1202 1203 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) { 1204 initBuckets(status); 1205 if (U_FAILURE(status)) { 1206 return 0; 1207 } 1208 return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status); 1209 } 1210 1211 1212 int32_t AlphabeticIndex::getBucketIndex() const { 1213 return labelsIterIndex_; 1214 } 1215 1216 1217 UBool AlphabeticIndex::nextBucket(UErrorCode &status) { 1218 if (U_FAILURE(status)) { 1219 return FALSE; 1220 } 1221 if (buckets_ == NULL && currentBucket_ != NULL) { 1222 status = U_ENUM_OUT_OF_SYNC_ERROR; 1223 return FALSE; 1224 } 1225 initBuckets(status); 1226 if (U_FAILURE(status)) { 1227 return FALSE; 1228 } 1229 ++labelsIterIndex_; 1230 if (labelsIterIndex_ >= buckets_->getBucketCount()) { 1231 labelsIterIndex_ = buckets_->getBucketCount(); 1232 return FALSE; 1233 } 1234 currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_); 1235 resetRecordIterator(); 1236 return TRUE; 1237 } 1238 1239 const UnicodeString &AlphabeticIndex::getBucketLabel() const { 1240 if (currentBucket_ != NULL) { 1241 return currentBucket_->label_; 1242 } else { 1243 return emptyString_; 1244 } 1245 } 1246 1247 1248 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const { 1249 if (currentBucket_ != NULL) { 1250 return currentBucket_->labelType_; 1251 } else { 1252 return U_ALPHAINDEX_NORMAL; 1253 } 1254 } 1255 1256 1257 int32_t AlphabeticIndex::getBucketRecordCount() const { 1258 if (currentBucket_ != NULL && currentBucket_->records_ != NULL) { 1259 return currentBucket_->records_->size(); 1260 } else { 1261 return 0; 1262 } 1263 } 1264 1265 1266 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) { 1267 if (U_FAILURE(status)) { 1268 return *this; 1269 } 1270 internalResetBucketIterator(); 1271 return *this; 1272 } 1273 1274 1275 UBool AlphabeticIndex::nextRecord(UErrorCode &status) { 1276 if (U_FAILURE(status)) { 1277 return FALSE; 1278 } 1279 if (currentBucket_ == NULL) { 1280 // We are trying to iterate over the items in a bucket, but there is no 1281 // current bucket from the enumeration of buckets. 1282 status = U_INVALID_STATE_ERROR; 1283 return FALSE; 1284 } 1285 if (buckets_ == NULL) { 1286 status = U_ENUM_OUT_OF_SYNC_ERROR; 1287 return FALSE; 1288 } 1289 if (currentBucket_->records_ == NULL) { 1290 return FALSE; 1291 } 1292 ++itemsIterIndex_; 1293 if (itemsIterIndex_ >= currentBucket_->records_->size()) { 1294 itemsIterIndex_ = currentBucket_->records_->size(); 1295 return FALSE; 1296 } 1297 return TRUE; 1298 } 1299 1300 1301 const UnicodeString &AlphabeticIndex::getRecordName() const { 1302 const UnicodeString *retStr = &emptyString_; 1303 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1304 itemsIterIndex_ >= 0 && 1305 itemsIterIndex_ < currentBucket_->records_->size()) { 1306 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1307 retStr = &item->name_; 1308 } 1309 return *retStr; 1310 } 1311 1312 const void *AlphabeticIndex::getRecordData() const { 1313 const void *retPtr = NULL; 1314 if (currentBucket_ != NULL && currentBucket_->records_ != NULL && 1315 itemsIterIndex_ >= 0 && 1316 itemsIterIndex_ < currentBucket_->records_->size()) { 1317 Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_)); 1318 retPtr = item->data_; 1319 } 1320 return retPtr; 1321 } 1322 1323 1324 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() { 1325 itemsIterIndex_ = -1; 1326 return *this; 1327 } 1328 1329 1330 1331 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label, 1332 const UnicodeString &lowerBoundary, 1333 UAlphabeticIndexLabelType type) 1334 : label_(label), lowerBoundary_(lowerBoundary), labelType_(type), 1335 displayBucket_(NULL), displayIndex_(-1), 1336 records_(NULL) { 1337 } 1338 1339 1340 AlphabeticIndex::Bucket::~Bucket() { 1341 delete records_; 1342 } 1343 1344 U_NAMESPACE_END 1345 1346 #endif 1347