Home | History | Annotate | Download | only in i18n
      1 /*
      2 *******************************************************************************
      3 * Copyright (C) 2009-2013, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 *******************************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_NORMALIZATION
     11 
     12 #include "unicode/alphaindex.h"
     13 #include "unicode/coleitr.h"
     14 #include "unicode/coll.h"
     15 #include "unicode/localpointer.h"
     16 #include "unicode/normalizer2.h"
     17 #include "unicode/tblcoll.h"
     18 #include "unicode/ulocdata.h"
     19 #include "unicode/uniset.h"
     20 #include "unicode/uobject.h"
     21 #include "unicode/usetiter.h"
     22 #include "unicode/utf16.h"
     23 
     24 #include "cmemory.h"
     25 #include "cstring.h"
     26 #include "uassert.h"
     27 #include "uvector.h"
     28 
     29 //#include <string>
     30 //#include <iostream>
     31 
     32 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     33 
     34 U_NAMESPACE_BEGIN
     35 
     36 namespace {
     37 
     38 /**
     39  * Prefix string for Chinese index buckets.
     40  * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
     41  */
     42 const UChar BASE[1] = { 0xFDD0 };
     43 const int32_t BASE_LENGTH = 1;
     44 
     45 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
     46                                 const UnicodeString &one, const UnicodeString &other);
     47 
     48 }  // namespace
     49 
     50 static int32_t U_CALLCONV
     51 collatorComparator(const void *context, const void *left, const void *right);
     52 
     53 static int32_t U_CALLCONV
     54 recordCompareFn(const void *context, const void *left, const void *right);
     55 
     56 //  UVector<Record *> support function, delete a Record.
     57 static void U_CALLCONV
     58 alphaIndex_deleteRecord(void *obj) {
     59     delete static_cast<AlphabeticIndex::Record *>(obj);
     60 }
     61 
     62 namespace {
     63 
     64 UnicodeString *ownedString(const UnicodeString &s, LocalPointer<UnicodeString> &owned,
     65                            UErrorCode &errorCode) {
     66     if (U_FAILURE(errorCode)) { return NULL; }
     67     if (owned.isValid()) {
     68         return owned.orphan();
     69     }
     70     UnicodeString *p = new UnicodeString(s);
     71     if (p == NULL) {
     72         errorCode = U_MEMORY_ALLOCATION_ERROR;
     73     }
     74     return p;
     75 }
     76 
     77 inline UnicodeString *getString(const UVector &list, int32_t i) {
     78     return static_cast<UnicodeString *>(list[i]);
     79 }
     80 
     81 inline AlphabeticIndex::Bucket *getBucket(const UVector &list, int32_t i) {
     82     return static_cast<AlphabeticIndex::Bucket *>(list[i]);
     83 }
     84 
     85 inline AlphabeticIndex::Record *getRecord(const UVector &list, int32_t i) {
     86     return static_cast<AlphabeticIndex::Record *>(list[i]);
     87 }
     88 
     89 /**
     90  * Like Java Collections.binarySearch(List, String, Comparator).
     91  *
     92  * @return the index>=0 where the item was found,
     93  *         or the index<0 for inserting the string at ~index in sorted order
     94  */
     95 int32_t binarySearch(const UVector &list, const UnicodeString &s, const Collator &coll) {
     96     if (list.size() == 0) { return ~0; }
     97     int32_t start = 0;
     98     int32_t limit = list.size();
     99     for (;;) {
    100         int32_t i = (start + limit) / 2;
    101         const UnicodeString *si = static_cast<UnicodeString *>(list.elementAt(i));
    102         UErrorCode errorCode = U_ZERO_ERROR;
    103         UCollationResult cmp = coll.compare(s, *si, errorCode);
    104         if (cmp == UCOL_EQUAL) {
    105             return i;
    106         } else if (cmp < 0) {
    107             if (i == start) {
    108                 return ~start;  // insert s before *si
    109             }
    110             limit = i;
    111         } else {
    112             if (i == start) {
    113                 return ~(start + 1);  // insert s after *si
    114             }
    115             start = i;
    116         }
    117     }
    118 }
    119 
    120 }  // namespace
    121 
    122 // The BucketList is not in the anonymous namespace because only Clang
    123 // seems to support its use in other classes from there.
    124 // However, we also don't need U_I18N_API because it is not used from outside the i18n library.
    125 class BucketList : public UObject {
    126 public:
    127     BucketList(UVector *bucketList, UVector *publicBucketList)
    128             : bucketList_(bucketList), immutableVisibleList_(publicBucketList) {
    129         int32_t displayIndex = 0;
    130         for (int32_t i = 0; i < publicBucketList->size(); ++i) {
    131             getBucket(*publicBucketList, i)->displayIndex_ = displayIndex++;
    132         }
    133     }
    134 
    135     virtual ~BucketList() {
    136         delete bucketList_;
    137         if (immutableVisibleList_ != bucketList_) {
    138             delete immutableVisibleList_;
    139         }
    140     }
    141 
    142     int32_t getBucketCount() const {
    143         return immutableVisibleList_->size();
    144     }
    145 
    146     int32_t getBucketIndex(const UnicodeString &name, const Collator &collatorPrimaryOnly,
    147                            UErrorCode &errorCode) {
    148         // binary search
    149         int32_t start = 0;
    150         int32_t limit = bucketList_->size();
    151         while ((start + 1) < limit) {
    152             int32_t i = (start + limit) / 2;
    153             const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, i);
    154             UCollationResult nameVsBucket =
    155                 collatorPrimaryOnly.compare(name, bucket->lowerBoundary_, errorCode);
    156             if (nameVsBucket < 0) {
    157                 limit = i;
    158             } else {
    159                 start = i;
    160             }
    161         }
    162         const AlphabeticIndex::Bucket *bucket = getBucket(*bucketList_, start);
    163         if (bucket->displayBucket_ != NULL) {
    164             bucket = bucket->displayBucket_;
    165         }
    166         return bucket->displayIndex_;
    167     }
    168 
    169     /** All of the buckets, visible and invisible. */
    170     UVector *bucketList_;
    171     /** Just the visible buckets. */
    172     UVector *immutableVisibleList_;
    173 };
    174 
    175 AlphabeticIndex::ImmutableIndex::~ImmutableIndex() {
    176     delete buckets_;
    177     delete collatorPrimaryOnly_;
    178 }
    179 
    180 int32_t
    181 AlphabeticIndex::ImmutableIndex::getBucketCount() const {
    182     return buckets_->getBucketCount();
    183 }
    184 
    185 int32_t
    186 AlphabeticIndex::ImmutableIndex::getBucketIndex(
    187         const UnicodeString &name, UErrorCode &errorCode) const {
    188     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, errorCode);
    189 }
    190 
    191 const AlphabeticIndex::Bucket *
    192 AlphabeticIndex::ImmutableIndex::getBucket(int32_t index) const {
    193     if (0 <= index && index < buckets_->getBucketCount()) {
    194         return icu::getBucket(*buckets_->immutableVisibleList_, index);
    195     } else {
    196         return NULL;
    197     }
    198 }
    199 
    200 AlphabeticIndex::AlphabeticIndex(const Locale &locale, UErrorCode &status)
    201         : inputList_(NULL),
    202           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
    203           maxLabelCount_(99),
    204           initialLabels_(NULL), firstCharsInScripts_(NULL),
    205           collator_(NULL), collatorPrimaryOnly_(NULL),
    206           buckets_(NULL) {
    207     init(&locale, status);
    208 }
    209 
    210 
    211 AlphabeticIndex::AlphabeticIndex(RuleBasedCollator *collator, UErrorCode &status)
    212         : inputList_(NULL),
    213           labelsIterIndex_(-1), itemsIterIndex_(0), currentBucket_(NULL),
    214           maxLabelCount_(99),
    215           initialLabels_(NULL), firstCharsInScripts_(NULL),
    216           collator_(collator), collatorPrimaryOnly_(NULL),
    217           buckets_(NULL) {
    218     init(NULL, status);
    219 }
    220 
    221 
    222 
    223 AlphabeticIndex::~AlphabeticIndex() {
    224     delete collator_;
    225     delete collatorPrimaryOnly_;
    226     delete firstCharsInScripts_;
    227     delete buckets_;
    228     delete inputList_;
    229     delete initialLabels_;
    230 }
    231 
    232 
    233 AlphabeticIndex &AlphabeticIndex::addLabels(const UnicodeSet &additions, UErrorCode &status) {
    234     if (U_FAILURE(status)) {
    235         return *this;
    236     }
    237     initialLabels_->addAll(additions);
    238     clearBuckets();
    239     return *this;
    240 }
    241 
    242 
    243 AlphabeticIndex &AlphabeticIndex::addLabels(const Locale &locale, UErrorCode &status) {
    244     addIndexExemplars(locale, status);
    245     clearBuckets();
    246     return *this;
    247 }
    248 
    249 
    250 AlphabeticIndex::ImmutableIndex *AlphabeticIndex::buildImmutableIndex(UErrorCode &errorCode) {
    251     if (U_FAILURE(errorCode)) { return NULL; }
    252     // In C++, the ImmutableIndex must own its copy of the BucketList,
    253     // even if it contains no records, for proper memory management.
    254     // We could clone the buckets_ if they are not NULL,
    255     // but that would be worth it only if this method is called multiple times,
    256     // or called after using the old-style bucket iterator API.
    257     LocalPointer<BucketList> immutableBucketList(createBucketList(errorCode));
    258     LocalPointer<RuleBasedCollator> coll(
    259         static_cast<RuleBasedCollator *>(collatorPrimaryOnly_->clone()));
    260     if (immutableBucketList.isNull() || coll.isNull()) {
    261         errorCode = U_MEMORY_ALLOCATION_ERROR;
    262         return NULL;
    263     }
    264     ImmutableIndex *immIndex = new ImmutableIndex(immutableBucketList.getAlias(), coll.getAlias());
    265     if (immIndex == NULL) {
    266         errorCode = U_MEMORY_ALLOCATION_ERROR;
    267         return NULL;
    268     }
    269     // The ImmutableIndex adopted its parameter objects.
    270     immutableBucketList.orphan();
    271     coll.orphan();
    272     return immIndex;
    273 }
    274 
    275 int32_t AlphabeticIndex::getBucketCount(UErrorCode &status) {
    276     initBuckets(status);
    277     if (U_FAILURE(status)) {
    278         return 0;
    279     }
    280     return buckets_->getBucketCount();
    281 }
    282 
    283 
    284 int32_t AlphabeticIndex::getRecordCount(UErrorCode &status) {
    285     if (U_FAILURE(status) || inputList_ == NULL) {
    286         return 0;
    287     }
    288     return inputList_->size();
    289 }
    290 
    291 void AlphabeticIndex::initLabels(UVector &indexCharacters, UErrorCode &errorCode) const {
    292     const Normalizer2 *nfkdNormalizer = Normalizer2::getNFKDInstance(errorCode);
    293     if (U_FAILURE(errorCode)) { return; }
    294 
    295     const UnicodeString &firstScriptBoundary = *getString(*firstCharsInScripts_, 0);
    296     const UnicodeString &overflowBoundary =
    297         *getString(*firstCharsInScripts_, firstCharsInScripts_->size() - 1);
    298 
    299     // We make a sorted array of elements.
    300     // Some of the input may be redundant.
    301     // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
    302     // We filter out those cases.
    303     UnicodeSetIterator iter(*initialLabels_);
    304     while (iter.next()) {
    305         const UnicodeString *item = &iter.getString();
    306         LocalPointer<UnicodeString> ownedItem;
    307         UBool checkDistinct;
    308         int32_t itemLength = item->length();
    309         if (!item->hasMoreChar32Than(0, itemLength, 1)) {
    310             checkDistinct = FALSE;
    311         } else if(item->charAt(itemLength - 1) == 0x2a &&  // '*'
    312                 item->charAt(itemLength - 2) != 0x2a) {
    313             // Use a label if it is marked with one trailing star,
    314             // even if the label string sorts the same when all contractions are suppressed.
    315             ownedItem.adoptInstead(new UnicodeString(*item, 0, itemLength - 1));
    316             item = ownedItem.getAlias();
    317             if (item == NULL) {
    318                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    319                 return;
    320             }
    321             checkDistinct = FALSE;
    322         } else {
    323             checkDistinct = TRUE;
    324         }
    325         if (collatorPrimaryOnly_->compare(*item, firstScriptBoundary, errorCode) < 0) {
    326             // Ignore a primary-ignorable or non-alphabetic index character.
    327         } else if (collatorPrimaryOnly_->compare(*item, overflowBoundary, errorCode) >= 0) {
    328             // Ignore an index characters that will land in the overflow bucket.
    329         } else if (checkDistinct &&
    330                 collatorPrimaryOnly_->compare(*item, separated(*item), errorCode) == 0) {
    331             // Ignore a multi-code point index character that does not sort distinctly
    332             // from the sequence of its separate characters.
    333         } else {
    334             int32_t insertionPoint = binarySearch(indexCharacters, *item, *collatorPrimaryOnly_);
    335             if (insertionPoint < 0) {
    336                 indexCharacters.insertElementAt(
    337                     ownedString(*item, ownedItem, errorCode), ~insertionPoint, errorCode);
    338             } else {
    339                 const UnicodeString &itemAlreadyIn = *getString(indexCharacters, insertionPoint);
    340                 if (isOneLabelBetterThanOther(*nfkdNormalizer, *item, itemAlreadyIn)) {
    341                     indexCharacters.setElementAt(
    342                         ownedString(*item, ownedItem, errorCode), insertionPoint);
    343                 }
    344             }
    345         }
    346     }
    347     if (U_FAILURE(errorCode)) { return; }
    348 
    349     // if the result is still too large, cut down to maxCount elements, by removing every nth element
    350 
    351     int32_t size = indexCharacters.size() - 1;
    352     if (size > maxLabelCount_) {
    353         int32_t count = 0;
    354         int32_t old = -1;
    355         for (int32_t i = 0; i < indexCharacters.size();) {
    356             ++count;
    357             int32_t bump = count * maxLabelCount_ / size;
    358             if (bump == old) {
    359                 indexCharacters.removeElementAt(i);
    360             } else {
    361                 old = bump;
    362                 ++i;
    363             }
    364         }
    365     }
    366 }
    367 
    368 namespace {
    369 
    370 const UnicodeString &fixLabel(const UnicodeString &current, UnicodeString &temp) {
    371     if (!current.startsWith(BASE, BASE_LENGTH)) {
    372         return current;
    373     }
    374     UChar rest = current.charAt(BASE_LENGTH);
    375     if (0x2800 < rest && rest <= 0x28FF) { // stroke count
    376         int32_t count = rest-0x2800;
    377         temp.setTo((UChar)(0x30 + count % 10));
    378         if (count >= 10) {
    379             count /= 10;
    380             temp.insert(0, (UChar)(0x30 + count % 10));
    381             if (count >= 10) {
    382                 count /= 10;
    383                 temp.insert(0, (UChar)(0x30 + count));
    384             }
    385         }
    386         return temp.append((UChar)0x5283);
    387     }
    388     return temp.setTo(current, BASE_LENGTH);
    389 }
    390 
    391 UBool hasMultiplePrimaryWeights(
    392         CollationElementIterator &cei, int32_t variableTop,
    393         const UnicodeString &s, UErrorCode &errorCode) {
    394     cei.setText(s, errorCode);
    395     UBool seenPrimary = FALSE;
    396     for (;;) {
    397         int32_t ce32 = cei.next(errorCode);
    398         if (ce32 == CollationElementIterator::NULLORDER) {
    399             break;
    400         }
    401         int32_t p = CollationElementIterator::primaryOrder(ce32);
    402         if (p > variableTop && (ce32 & 0xc0) != 0xc0) {
    403             // not primary ignorable, and not a continuation CE
    404             if (seenPrimary) {
    405                 return TRUE;
    406             }
    407             seenPrimary = TRUE;
    408         }
    409     }
    410     return FALSE;
    411 }
    412 
    413 }  // namespace
    414 
    415 BucketList *AlphabeticIndex::createBucketList(UErrorCode &errorCode) const {
    416     // Initialize indexCharacters.
    417     UVector indexCharacters(errorCode);
    418     indexCharacters.setDeleter(uprv_deleteUObject);
    419     initLabels(indexCharacters, errorCode);
    420     if (U_FAILURE(errorCode)) { return NULL; }
    421 
    422     // Variables for hasMultiplePrimaryWeights().
    423     LocalPointer<CollationElementIterator> cei(
    424         collatorPrimaryOnly_->createCollationElementIterator(emptyString_));
    425     if (cei.isNull()) {
    426         errorCode = U_MEMORY_ALLOCATION_ERROR;
    427         return NULL;
    428     }
    429     int32_t variableTop;
    430     if (collatorPrimaryOnly_->getAttribute(UCOL_ALTERNATE_HANDLING, errorCode) == UCOL_SHIFTED) {
    431         variableTop = CollationElementIterator::primaryOrder(
    432             (int32_t)collatorPrimaryOnly_->getVariableTop(errorCode));
    433     } else {
    434         variableTop = 0;
    435     }
    436     UBool hasInvisibleBuckets = FALSE;
    437 
    438     // Helper arrays for Chinese Pinyin collation.
    439     Bucket *asciiBuckets[26] = {
    440         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    441         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
    442     };
    443     Bucket *pinyinBuckets[26] = {
    444         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
    445         NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL
    446     };
    447     UBool hasPinyin = FALSE;
    448 
    449     LocalPointer<UVector> bucketList(new UVector(errorCode));
    450     if (bucketList.isNull()) {
    451         errorCode = U_MEMORY_ALLOCATION_ERROR;
    452         return NULL;
    453     }
    454     bucketList->setDeleter(uprv_deleteUObject);
    455 
    456     // underflow bucket
    457     Bucket *bucket = new Bucket(getUnderflowLabel(), emptyString_, U_ALPHAINDEX_UNDERFLOW);
    458     if (bucket == NULL) {
    459         errorCode = U_MEMORY_ALLOCATION_ERROR;
    460         return NULL;
    461     }
    462     bucketList->addElement(bucket, errorCode);
    463     if (U_FAILURE(errorCode)) { return NULL; }
    464 
    465     UnicodeString temp;
    466 
    467     // fix up the list, adding underflow, additions, overflow
    468     // Insert inflow labels as needed.
    469     int32_t scriptIndex = -1;
    470     const UnicodeString *scriptUpperBoundary = &emptyString_;
    471     for (int32_t i = 0; i < indexCharacters.size(); ++i) {
    472         UnicodeString &current = *getString(indexCharacters, i);
    473         if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) >= 0) {
    474             // We crossed the script boundary into a new script.
    475             const UnicodeString &inflowBoundary = *scriptUpperBoundary;
    476             UBool skippedScript = FALSE;
    477             for (;;) {
    478                 scriptUpperBoundary = getString(*firstCharsInScripts_, ++scriptIndex);
    479                 if (collatorPrimaryOnly_->compare(current, *scriptUpperBoundary, errorCode) < 0) {
    480                     break;
    481                 }
    482                 skippedScript = TRUE;
    483             }
    484             if (skippedScript && bucketList->size() > 1) {
    485                 // We are skipping one or more scripts,
    486                 // and we are not just getting out of the underflow label.
    487                 bucket = new Bucket(getInflowLabel(), inflowBoundary, U_ALPHAINDEX_INFLOW);
    488                 if (bucket == NULL) {
    489                     errorCode = U_MEMORY_ALLOCATION_ERROR;
    490                     return NULL;
    491                 }
    492                 bucketList->addElement(bucket, errorCode);
    493             }
    494         }
    495         // Add a bucket with the current label.
    496         bucket = new Bucket(fixLabel(current, temp), current, U_ALPHAINDEX_NORMAL);
    497         if (bucket == NULL) {
    498             errorCode = U_MEMORY_ALLOCATION_ERROR;
    499             return NULL;
    500         }
    501         bucketList->addElement(bucket, errorCode);
    502         // Remember ASCII and Pinyin buckets for Pinyin redirects.
    503         UChar c;
    504         if (current.length() == 1 && 0x41 <= (c = current.charAt(0)) && c <= 0x5A) {  // A-Z
    505             asciiBuckets[c - 0x41] = bucket;
    506         } else if (current.length() == BASE_LENGTH + 1 && current.startsWith(BASE, BASE_LENGTH) &&
    507                 0x41 <= (c = current.charAt(BASE_LENGTH)) && c <= 0x5A) {
    508             pinyinBuckets[c - 0x41] = bucket;
    509             hasPinyin = TRUE;
    510         }
    511         // Check for multiple primary weights.
    512         if (!current.startsWith(BASE, BASE_LENGTH) &&
    513                 hasMultiplePrimaryWeights(*cei, variableTop, current, errorCode) &&
    514                 current.charAt(current.length() - 1) != 0xFFFF /* !current.endsWith("\uffff") */) {
    515             // "AE-ligature" or "Sch" etc.
    516             for (int32_t i = bucketList->size() - 2;; --i) {
    517                 Bucket *singleBucket = getBucket(*bucketList, i);
    518                 if (singleBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
    519                     // There is no single-character bucket since the last
    520                     // underflow or inflow label.
    521                     break;
    522                 }
    523                 if (singleBucket->displayBucket_ == NULL &&
    524                         !hasMultiplePrimaryWeights(
    525                             *cei, variableTop, singleBucket->lowerBoundary_, errorCode)) {
    526                     // Add an invisible bucket that redirects strings greater than the expansion
    527                     // to the previous single-character bucket.
    528                     // For example, after ... Q R S Sch we add Sch\uFFFF->S
    529                     // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
    530                     bucket = new Bucket(emptyString_,
    531                         UnicodeString(current).append((UChar)0xFFFF),
    532                         U_ALPHAINDEX_NORMAL);
    533                     if (bucket == NULL) {
    534                         errorCode = U_MEMORY_ALLOCATION_ERROR;
    535                         return NULL;
    536                     }
    537                     bucket->displayBucket_ = singleBucket;
    538                     bucketList->addElement(bucket, errorCode);
    539                     hasInvisibleBuckets = TRUE;
    540                     break;
    541                 }
    542             }
    543         }
    544     }
    545     if (U_FAILURE(errorCode)) { return NULL; }
    546     if (bucketList->size() == 1) {
    547         // No real labels, show only the underflow label.
    548         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
    549         if (bl == NULL) {
    550             errorCode = U_MEMORY_ALLOCATION_ERROR;
    551             return NULL;
    552         }
    553         bucketList.orphan();
    554         return bl;
    555     }
    556     // overflow bucket
    557     bucket = new Bucket(getOverflowLabel(), *scriptUpperBoundary, U_ALPHAINDEX_OVERFLOW);
    558     if (bucket == NULL) {
    559         errorCode = U_MEMORY_ALLOCATION_ERROR;
    560         return NULL;
    561     }
    562     bucketList->addElement(bucket, errorCode); // final
    563 
    564     if (hasPinyin) {
    565         // Redirect Pinyin buckets.
    566         Bucket *asciiBucket = NULL;
    567         for (int32_t i = 0; i < 26; ++i) {
    568             if (asciiBuckets[i] != NULL) {
    569                 asciiBucket = asciiBuckets[i];
    570             }
    571             if (pinyinBuckets[i] != NULL && asciiBucket != NULL) {
    572                 pinyinBuckets[i]->displayBucket_ = asciiBucket;
    573                 hasInvisibleBuckets = TRUE;
    574             }
    575         }
    576     }
    577 
    578     if (U_FAILURE(errorCode)) { return NULL; }
    579     if (!hasInvisibleBuckets) {
    580         BucketList *bl = new BucketList(bucketList.getAlias(), bucketList.getAlias());
    581         if (bl == NULL) {
    582             errorCode = U_MEMORY_ALLOCATION_ERROR;
    583             return NULL;
    584         }
    585         bucketList.orphan();
    586         return bl;
    587     }
    588     // Merge inflow buckets that are visually adjacent.
    589     // Iterate backwards: Merge inflow into overflow rather than the other way around.
    590     int32_t i = bucketList->size() - 1;
    591     Bucket *nextBucket = getBucket(*bucketList, i);
    592     while (--i > 0) {
    593         bucket = getBucket(*bucketList, i);
    594         if (bucket->displayBucket_ != NULL) {
    595             continue;  // skip invisible buckets
    596         }
    597         if (bucket->labelType_ == U_ALPHAINDEX_INFLOW) {
    598             if (nextBucket->labelType_ != U_ALPHAINDEX_NORMAL) {
    599                 bucket->displayBucket_ = nextBucket;
    600                 continue;
    601             }
    602         }
    603         nextBucket = bucket;
    604     }
    605 
    606     LocalPointer<UVector> publicBucketList(new UVector(errorCode));
    607     if (bucketList.isNull()) {
    608         errorCode = U_MEMORY_ALLOCATION_ERROR;
    609         return NULL;
    610     }
    611     // Do not call publicBucketList->setDeleter():
    612     // This vector shares its objects with the bucketList.
    613     for (int32_t i = 0; i < bucketList->size(); ++i) {
    614         bucket = getBucket(*bucketList, i);
    615         if (bucket->displayBucket_ == NULL) {
    616             publicBucketList->addElement(bucket, errorCode);
    617         }
    618     }
    619     if (U_FAILURE(errorCode)) { return NULL; }
    620     BucketList *bl = new BucketList(bucketList.getAlias(), publicBucketList.getAlias());
    621     if (bl == NULL) {
    622         errorCode = U_MEMORY_ALLOCATION_ERROR;
    623         return NULL;
    624     }
    625     bucketList.orphan();
    626     publicBucketList.orphan();
    627     return bl;
    628 }
    629 
    630 /**
    631  * Creates an index, and buckets and sorts the list of records into the index.
    632  */
    633 void AlphabeticIndex::initBuckets(UErrorCode &errorCode) {
    634     if (U_FAILURE(errorCode) || buckets_ != NULL) {
    635         return;
    636     }
    637     buckets_ = createBucketList(errorCode);
    638     if (U_FAILURE(errorCode) || inputList_ == NULL || inputList_->isEmpty()) {
    639         return;
    640     }
    641 
    642     // Sort the records by name.
    643     // Stable sort preserves input order of collation duplicates.
    644     inputList_->sortWithUComparator(recordCompareFn, collator_, errorCode);
    645 
    646     // Now, we traverse all of the input, which is now sorted.
    647     // If the item doesn't go in the current bucket, we find the next bucket that contains it.
    648     // This makes the process order n*log(n), since we just sort the list and then do a linear process.
    649     // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
    650     // we need to improve it for that case.
    651 
    652     Bucket *currentBucket = getBucket(*buckets_->bucketList_, 0);
    653     int32_t bucketIndex = 1;
    654     Bucket *nextBucket;
    655     const UnicodeString *upperBoundary;
    656     if (bucketIndex < buckets_->bucketList_->size()) {
    657         nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
    658         upperBoundary = &nextBucket->lowerBoundary_;
    659     } else {
    660         nextBucket = NULL;
    661         upperBoundary = NULL;
    662     }
    663     for (int32_t i = 0; i < inputList_->size(); ++i) {
    664         Record *r = getRecord(*inputList_, i);
    665         // if the current bucket isn't the right one, find the one that is
    666         // We have a special flag for the last bucket so that we don't look any further
    667         while (upperBoundary != NULL &&
    668                 collatorPrimaryOnly_->compare(r->name_, *upperBoundary, errorCode) >= 0) {
    669             currentBucket = nextBucket;
    670             // now reset the boundary that we compare against
    671             if (bucketIndex < buckets_->bucketList_->size()) {
    672                 nextBucket = getBucket(*buckets_->bucketList_, bucketIndex++);
    673                 upperBoundary = &nextBucket->lowerBoundary_;
    674             } else {
    675                 upperBoundary = NULL;
    676             }
    677         }
    678         // now put the record into the bucket.
    679         Bucket *bucket = currentBucket;
    680         if (bucket->displayBucket_ != NULL) {
    681             bucket = bucket->displayBucket_;
    682         }
    683         if (bucket->records_ == NULL) {
    684             bucket->records_ = new UVector(errorCode);
    685             if (bucket->records_ == NULL) {
    686                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    687                 return;
    688             }
    689         }
    690         bucket->records_->addElement(r, errorCode);
    691     }
    692 }
    693 
    694 void AlphabeticIndex::clearBuckets() {
    695     if (buckets_ != NULL) {
    696         delete buckets_;
    697         buckets_ = NULL;
    698         internalResetBucketIterator();
    699     }
    700 }
    701 
    702 void AlphabeticIndex::internalResetBucketIterator() {
    703     labelsIterIndex_ = -1;
    704     currentBucket_ = NULL;
    705 }
    706 
    707 
    708 void AlphabeticIndex::addIndexExemplars(const Locale &locale, UErrorCode &status) {
    709     if (U_FAILURE(status)) { return; }
    710     // Chinese index characters, which are specific to each of the several Chinese tailorings,
    711     // take precedence over the single locale data exemplar set per language.
    712     const char *language = locale.getLanguage();
    713     if (uprv_strcmp(language, "zh") == 0 || uprv_strcmp(language, "ja") == 0 ||
    714             uprv_strcmp(language, "ko") == 0) {
    715         // TODO: This should be done regardless of the language, but it's expensive.
    716         // We should add a Collator function (can be @internal)
    717         // to enumerate just the contractions that start with a given code point or string.
    718         if (addChineseIndexCharacters(status) || U_FAILURE(status)) {
    719             return;
    720         }
    721     }
    722 
    723     LocalULocaleDataPointer uld(ulocdata_open(locale.getName(), &status));
    724     if (U_FAILURE(status)) {
    725         return;
    726     }
    727 
    728     UnicodeSet exemplars;
    729     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_INDEX, &status);
    730     if (U_SUCCESS(status)) {
    731         initialLabels_->addAll(exemplars);
    732         return;
    733     }
    734     status = U_ZERO_ERROR;  // Clear out U_MISSING_RESOURCE_ERROR
    735 
    736     // The locale data did not include explicit Index characters.
    737     // Synthesize a set of them from the locale's standard exemplar characters.
    738     ulocdata_getExemplarSet(uld.getAlias(), exemplars.toUSet(), 0, ULOCDATA_ES_STANDARD, &status);
    739     if (U_FAILURE(status)) {
    740         return;
    741     }
    742 
    743     // question: should we add auxiliary exemplars?
    744     if (exemplars.containsSome(0x61, 0x7A) /* a-z */ || exemplars.size() == 0) {
    745         exemplars.add(0x61, 0x7A);
    746     }
    747     if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
    748         // cut down to small list
    749         exemplars.remove(0xAC00, 0xD7A3).
    750             add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
    751             add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
    752             add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
    753             add(0xD30C).add(0xD558);
    754     }
    755     if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
    756         // cut down to small list
    757         // make use of the fact that Ethiopic is allocated in 8's, where
    758         // the base is 0 mod 8.
    759         UnicodeSet ethiopic(
    760             UNICODE_STRING_SIMPLE("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]"), status);
    761         UnicodeSetIterator it(ethiopic);
    762         while (it.next() && !it.isString()) {
    763             if ((it.getCodepoint() & 0x7) != 0) {
    764                 exemplars.remove(it.getCodepoint());
    765             }
    766         }
    767     }
    768 
    769     // Upper-case any that aren't already so.
    770     //   (We only do this for synthesized index characters.)
    771     UnicodeSetIterator it(exemplars);
    772     UnicodeString upperC;
    773     while (it.next()) {
    774         const UnicodeString &exemplarC = it.getString();
    775         upperC = exemplarC;
    776         upperC.toUpper(locale);
    777         initialLabels_->add(upperC);
    778     }
    779 }
    780 
    781 UBool AlphabeticIndex::addChineseIndexCharacters(UErrorCode &errorCode) {
    782     UnicodeSet contractions;
    783     ucol_getContractionsAndExpansions(collatorPrimaryOnly_->getUCollator(),
    784                                       contractions.toUSet(), NULL, FALSE, &errorCode);
    785     if (U_FAILURE(errorCode)) { return FALSE; }
    786     UnicodeString firstHanBoundary;
    787     UBool hasPinyin = FALSE;
    788     UnicodeSetIterator iter(contractions);
    789     while (iter.next()) {
    790         const UnicodeString &s = iter.getString();
    791         if (s.startsWith(BASE, BASE_LENGTH)) {
    792             initialLabels_->add(s);
    793             if (firstHanBoundary.isEmpty() ||
    794                     collatorPrimaryOnly_->compare(s, firstHanBoundary, errorCode) < 0) {
    795                 firstHanBoundary = s;
    796             }
    797             UChar c = s.charAt(s.length() - 1);
    798             if (0x41 <= c && c <= 0x5A) {  // A-Z
    799                 hasPinyin = TRUE;
    800             }
    801         }
    802     }
    803     if (hasPinyin) {
    804         initialLabels_->add(0x41, 0x5A);  // A-Z
    805     }
    806     if (!firstHanBoundary.isEmpty()) {
    807         // The hardcoded list of script boundaries includes U+4E00
    808         // which is tailored to not be the first primary
    809         // in all Chinese tailorings except "unihan".
    810         // Replace U+4E00 with the first boundary string from the tailoring.
    811         // TODO: This becomes obsolete when the root collator gets
    812         // reliable script-first-primary mappings.
    813         int32_t hanIndex = binarySearch(
    814                 *firstCharsInScripts_, UnicodeString((UChar)0x4E00), *collatorPrimaryOnly_);
    815         if (hanIndex >= 0) {
    816             UnicodeString *fh = new UnicodeString(firstHanBoundary);
    817             if (fh == NULL) {
    818                 errorCode = U_MEMORY_ALLOCATION_ERROR;
    819                 return FALSE;
    820             }
    821             firstCharsInScripts_->setElementAt(fh, hanIndex);
    822         }
    823         return TRUE;
    824     } else {
    825         return FALSE;
    826     }
    827 }
    828 
    829 
    830 /*
    831  * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
    832  */
    833 static const UChar CGJ = 0x034F;
    834 UnicodeString AlphabeticIndex::separated(const UnicodeString &item) {
    835     UnicodeString result;
    836     if (item.length() == 0) {
    837         return result;
    838     }
    839     int32_t i = 0;
    840     for (;;) {
    841         UChar32  cp = item.char32At(i);
    842         result.append(cp);
    843         i = item.moveIndex32(i, 1);
    844         if (i >= item.length()) {
    845             break;
    846         }
    847         result.append(CGJ);
    848     }
    849     return result;
    850 }
    851 
    852 
    853 UBool AlphabeticIndex::operator==(const AlphabeticIndex& /* other */) const {
    854     return FALSE;
    855 }
    856 
    857 
    858 UBool AlphabeticIndex::operator!=(const AlphabeticIndex& /* other */) const {
    859     return FALSE;
    860 }
    861 
    862 
    863 const RuleBasedCollator &AlphabeticIndex::getCollator() const {
    864     // There are no known non-RuleBasedCollator collators, and none ever expected.
    865     // But, in case that changes, better a null pointer than a wrong type.
    866     return *dynamic_cast<RuleBasedCollator *>(collator_);
    867 }
    868 
    869 
    870 const UnicodeString &AlphabeticIndex::getInflowLabel() const {
    871     return inflowLabel_;
    872 }
    873 
    874 const UnicodeString &AlphabeticIndex::getOverflowLabel() const {
    875     return overflowLabel_;
    876 }
    877 
    878 
    879 const UnicodeString &AlphabeticIndex::getUnderflowLabel() const {
    880     return underflowLabel_;
    881 }
    882 
    883 
    884 AlphabeticIndex &AlphabeticIndex::setInflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    885     inflowLabel_ = label;
    886     clearBuckets();
    887     return *this;
    888 }
    889 
    890 
    891 AlphabeticIndex &AlphabeticIndex::setOverflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    892     overflowLabel_ = label;
    893     clearBuckets();
    894     return *this;
    895 }
    896 
    897 
    898 AlphabeticIndex &AlphabeticIndex::setUnderflowLabel(const UnicodeString &label, UErrorCode &/*status*/) {
    899     underflowLabel_ = label;
    900     clearBuckets();
    901     return *this;
    902 }
    903 
    904 
    905 int32_t AlphabeticIndex::getMaxLabelCount() const {
    906     return maxLabelCount_;
    907 }
    908 
    909 
    910 AlphabeticIndex &AlphabeticIndex::setMaxLabelCount(int32_t maxLabelCount, UErrorCode &status) {
    911     if (U_FAILURE(status)) {
    912         return *this;
    913     }
    914     if (maxLabelCount <= 0) {
    915         status = U_ILLEGAL_ARGUMENT_ERROR;
    916         return *this;
    917     }
    918     maxLabelCount_ = maxLabelCount;
    919     clearBuckets();
    920     return *this;
    921 }
    922 
    923 
    924 //
    925 //  init() - Common code for constructors.
    926 //
    927 
    928 void AlphabeticIndex::init(const Locale *locale, UErrorCode &status) {
    929     if (U_FAILURE(status)) { return; }
    930     if (locale == NULL && collator_ == NULL) {
    931         status = U_ILLEGAL_ARGUMENT_ERROR;
    932         return;
    933     }
    934 
    935     initialLabels_         = new UnicodeSet();
    936     if (initialLabels_ == NULL) {
    937         status = U_MEMORY_ALLOCATION_ERROR;
    938         return;
    939     }
    940 
    941     inflowLabel_.setTo((UChar)0x2026);    // Ellipsis
    942     overflowLabel_ = inflowLabel_;
    943     underflowLabel_ = inflowLabel_;
    944 
    945     if (collator_ == NULL) {
    946         collator_ = static_cast<RuleBasedCollator *>(Collator::createInstance(*locale, status));
    947         if (U_FAILURE(status)) { return; }
    948         if (collator_ == NULL) {
    949             status = U_MEMORY_ALLOCATION_ERROR;
    950             return;
    951         }
    952     }
    953     collatorPrimaryOnly_ = static_cast<RuleBasedCollator *>(collator_->clone());
    954     if (collatorPrimaryOnly_ == NULL) {
    955         status = U_MEMORY_ALLOCATION_ERROR;
    956         return;
    957     }
    958     collatorPrimaryOnly_->setAttribute(UCOL_STRENGTH, UCOL_PRIMARY, status);
    959     firstCharsInScripts_ = firstStringsInScript(status);
    960     if (U_FAILURE(status)) { return; }
    961     firstCharsInScripts_->sortWithUComparator(collatorComparator, collatorPrimaryOnly_, status);
    962     UnicodeString _4E00((UChar)0x4E00);
    963     UnicodeString _1100((UChar)0x1100);
    964     UnicodeString _1112((UChar)0x1112);
    965     if (collatorPrimaryOnly_->compare(_4E00, _1112, status) <= 0 &&
    966             collatorPrimaryOnly_->compare(_1100, _4E00, status) <= 0) {
    967         // The standard Korean tailoring sorts Hanja (Han characters)
    968         // as secondary differences from Hangul syllables.
    969         // This makes U+4E00 not useful as a Han-script boundary.
    970         // TODO: This becomes obsolete when the root collator gets
    971         // reliable script-first-primary mappings.
    972         int32_t hanIndex = binarySearch(
    973                 *firstCharsInScripts_, _4E00, *collatorPrimaryOnly_);
    974         if (hanIndex >= 0) {
    975             firstCharsInScripts_->removeElementAt(hanIndex);
    976         }
    977     }
    978     // Guard against a degenerate collator where
    979     // some script boundary strings are primary ignorable.
    980     for (;;) {
    981         if (U_FAILURE(status)) { return; }
    982         if (firstCharsInScripts_->isEmpty()) {
    983             // AlphabeticIndex requires some non-ignorable script boundary strings.
    984             status = U_ILLEGAL_ARGUMENT_ERROR;
    985             return;
    986         }
    987         if (collatorPrimaryOnly_->compare(
    988                 *static_cast<UnicodeString *>(firstCharsInScripts_->elementAt(0)),
    989                 emptyString_, status) == UCOL_EQUAL) {
    990             firstCharsInScripts_->removeElementAt(0);
    991         } else {
    992             break;
    993         }
    994     }
    995 
    996     if (locale != NULL) {
    997         addIndexExemplars(*locale, status);
    998     }
    999 }
   1000 
   1001 
   1002 //
   1003 //  Comparison function for UVector<UnicodeString *> sorting with a collator.
   1004 //
   1005 static int32_t U_CALLCONV
   1006 collatorComparator(const void *context, const void *left, const void *right) {
   1007     const UElement *leftElement = static_cast<const UElement *>(left);
   1008     const UElement *rightElement = static_cast<const UElement *>(right);
   1009     const UnicodeString *leftString  = static_cast<const UnicodeString *>(leftElement->pointer);
   1010     const UnicodeString *rightString = static_cast<const UnicodeString *>(rightElement->pointer);
   1011 
   1012     if (leftString == rightString) {
   1013         // Catches case where both are NULL
   1014         return 0;
   1015     }
   1016     if (leftString == NULL) {
   1017         return 1;
   1018     };
   1019     if (rightString == NULL) {
   1020         return -1;
   1021     }
   1022     const Collator *col = static_cast<const Collator *>(context);
   1023     UErrorCode errorCode = U_ZERO_ERROR;
   1024     return col->compare(*leftString, *rightString, errorCode);
   1025 }
   1026 
   1027 //
   1028 //  Comparison function for UVector<Record *> sorting with a collator.
   1029 //
   1030 static int32_t U_CALLCONV
   1031 recordCompareFn(const void *context, const void *left, const void *right) {
   1032     const UElement *leftElement = static_cast<const UElement *>(left);
   1033     const UElement *rightElement = static_cast<const UElement *>(right);
   1034     const AlphabeticIndex::Record *leftRec  = static_cast<const AlphabeticIndex::Record *>(leftElement->pointer);
   1035     const AlphabeticIndex::Record *rightRec = static_cast<const AlphabeticIndex::Record *>(rightElement->pointer);
   1036     const Collator *col = static_cast<const Collator *>(context);
   1037     UErrorCode errorCode = U_ZERO_ERROR;
   1038     return col->compare(leftRec->name_, rightRec->name_, errorCode);
   1039 }
   1040 
   1041 
   1042 /**
   1043  * This list contains one character per script that has the
   1044  * lowest primary weight for that script in the root collator.
   1045  * This list will be copied and sorted to account for script reordering.
   1046  *
   1047  * <p>TODO: This is fragile. If the first character of a script is tailored
   1048  * so that it does not map to the script's lowest primary weight any more,
   1049  * then the buckets will be off.
   1050  * There are hacks in the code to handle the known CJK tailorings of U+4E00.
   1051  *
   1052  * <p>We use "A" not "a" because the en_US_POSIX tailoring sorts A primary-before a.
   1053  *
   1054  * Keep this in sync with HACK_FIRST_CHARS_IN_SCRIPTS in
   1055  * ICU4J main/classes/collate/src/com/ibm/icu/text/AlphabeticIndex.java
   1056  */
   1057 static const UChar HACK_FIRST_CHARS_IN_SCRIPTS[] =  {
   1058     0x41, 0, 0x03B1, 0,
   1059     0x2C81, 0, 0x0430, 0, 0x2C30, 0, 0x10D0, 0, 0x0561, 0, 0x05D0, 0, 0xD802, 0xDD00, 0, 0x0800, 0, 0x0621, 0, 0x0710, 0,
   1060     0x0780, 0, 0x07CA, 0, 0x2D30, 0, 0x1200, 0, 0x0950, 0, 0x0985, 0, 0x0A74, 0, 0x0AD0, 0, 0x0B05, 0, 0x0BD0, 0,
   1061     0x0C05, 0, 0x0C85, 0, 0x0D05, 0, 0x0D85, 0,
   1062     0xAAF2, 0,  // Meetei Mayek
   1063     0xA800, 0, 0xA882, 0, 0xD804, 0xDC83, 0,
   1064     U16_LEAD(0x111C4), U16_TRAIL(0x111C4), 0,  // Sharada
   1065     U16_LEAD(0x11680), U16_TRAIL(0x11680), 0,  // Takri
   1066     0x1B83, 0,
   1067     0xD802, 0xDE00, 0, 0x0E01, 0,
   1068     0x0EDE, 0,  // Lao
   1069     0xAA80, 0, 0x0F40, 0, 0x1C00, 0, 0xA840, 0, 0x1900, 0, 0x1700, 0, 0x1720, 0,
   1070     0x1740, 0, 0x1760, 0, 0x1A00, 0, 0xA930, 0, 0xA90A, 0, 0x1000, 0,
   1071     U16_LEAD(0x11103), U16_TRAIL(0x11103), 0,  // Chakma
   1072     0x1780, 0, 0x1950, 0, 0x1980, 0, 0x1A20, 0,
   1073     0xAA00, 0, 0x1B05, 0, 0xA984, 0, 0x1880, 0, 0x1C5A, 0, 0x13A0, 0, 0x1401, 0, 0x1681, 0, 0x16A0, 0, 0xD803, 0xDC00, 0,
   1074     0xA500, 0, 0xA6A0, 0, 0x1100, 0, 0x3041, 0, 0x30A1, 0, 0x3105, 0, 0xA000, 0, 0xA4F8, 0,
   1075     U16_LEAD(0x16F00), U16_TRAIL(0x16F00), 0,  // Miao
   1076     0xD800, 0xDE80, 0,
   1077     0xD800, 0xDEA0, 0, 0xD802, 0xDD20, 0, 0xD800, 0xDF00, 0, 0xD800, 0xDF30, 0, 0xD801, 0xDC28, 0, 0xD801, 0xDC50, 0,
   1078     0xD801, 0xDC80, 0,
   1079     U16_LEAD(0x110D0), U16_TRAIL(0x110D0), 0,  // Sora Sompeng
   1080     0xD800, 0xDC00, 0, 0xD802, 0xDC00, 0, 0xD802, 0xDE60, 0, 0xD802, 0xDF00, 0, 0xD802, 0xDC40, 0,
   1081     0xD802, 0xDF40, 0, 0xD802, 0xDF60, 0, 0xD800, 0xDF80, 0, 0xD800, 0xDFA0, 0, 0xD808, 0xDC00, 0, 0xD80C, 0xDC00, 0,
   1082     U16_LEAD(0x109A0), U16_TRAIL(0x109A0), 0,  // Meroitic Cursive
   1083     U16_LEAD(0x10980), U16_TRAIL(0x10980), 0,  // Meroitic Hieroglyphs
   1084     0x4E00, 0,
   1085     // TODO: The overflow bucket's lowerBoundary string should be the
   1086     // first item after the last reordering group in the collator's script order.
   1087     // This should normally be the first Unicode code point
   1088     // that is unassigned (U+0378 in Unicode 6.3) and untailored.
   1089     // However, at least up to ICU 51 the Hani reordering group includes
   1090     // unassigned code points,
   1091     // and there is no stable string for the start of the trailing-weights range.
   1092     // The only known string that sorts "high" is U+FFFF.
   1093     // When ICU separates Hani vs. unassigned reordering groups, we need to fix this,
   1094     // and fix relevant test code.
   1095     // Ideally, FractionalUCA.txt will have a "script first primary"
   1096     // for unassigned code points.
   1097     0xFFFF, 0
   1098 };
   1099 
   1100 UVector *AlphabeticIndex::firstStringsInScript(UErrorCode &status) {
   1101     if (U_FAILURE(status)) {
   1102         return NULL;
   1103     }
   1104     UVector *dest = new UVector(status);
   1105     if (dest == NULL) {
   1106         status = U_MEMORY_ALLOCATION_ERROR;
   1107         return NULL;
   1108     }
   1109     dest->setDeleter(uprv_deleteUObject);
   1110     const UChar *src  = HACK_FIRST_CHARS_IN_SCRIPTS;
   1111     const UChar *limit = src + LENGTHOF(HACK_FIRST_CHARS_IN_SCRIPTS);
   1112     do {
   1113         if (U_FAILURE(status)) {
   1114             return dest;
   1115         }
   1116         UnicodeString *str = new UnicodeString(src, -1);
   1117         if (str == NULL) {
   1118             status = U_MEMORY_ALLOCATION_ERROR;
   1119             return dest;
   1120         }
   1121         dest->addElement(str, status);
   1122         src += str->length() + 1;
   1123     } while (src < limit);
   1124     return dest;
   1125 }
   1126 
   1127 
   1128 namespace {
   1129 
   1130 /**
   1131  * Returns true if one index character string is "better" than the other.
   1132  * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
   1133  * better, and otherwise binary-less-than is better.
   1134  */
   1135 UBool isOneLabelBetterThanOther(const Normalizer2 &nfkdNormalizer,
   1136                                 const UnicodeString &one, const UnicodeString &other) {
   1137     // This is called with primary-equal strings, but never with one.equals(other).
   1138     UErrorCode status = U_ZERO_ERROR;
   1139     UnicodeString n1 = nfkdNormalizer.normalize(one, status);
   1140     UnicodeString n2 = nfkdNormalizer.normalize(other, status);
   1141     if (U_FAILURE(status)) { return FALSE; }
   1142     int32_t result = n1.countChar32() - n2.countChar32();
   1143     if (result != 0) {
   1144         return result < 0;
   1145     }
   1146     result = n1.compareCodePointOrder(n2);
   1147     if (result != 0) {
   1148         return result < 0;
   1149     }
   1150     return one.compareCodePointOrder(other) < 0;
   1151 }
   1152 
   1153 }  // namespace
   1154 
   1155 //
   1156 //  Constructor & Destructor for AlphabeticIndex::Record
   1157 //
   1158 //     Records are internal only, instances are not directly surfaced in the public API.
   1159 //     This class is mostly struct-like, with all public fields.
   1160 
   1161 AlphabeticIndex::Record::Record(const UnicodeString &name, const void *data)
   1162         : name_(name), data_(data) {}
   1163 
   1164 AlphabeticIndex::Record::~Record() {
   1165 }
   1166 
   1167 
   1168 AlphabeticIndex & AlphabeticIndex::addRecord(const UnicodeString &name, const void *data, UErrorCode &status) {
   1169     if (U_FAILURE(status)) {
   1170         return *this;
   1171     }
   1172     if (inputList_ == NULL) {
   1173         inputList_ = new UVector(status);
   1174         if (inputList_ == NULL) {
   1175             status = U_MEMORY_ALLOCATION_ERROR;
   1176             return *this;
   1177         }
   1178         inputList_->setDeleter(alphaIndex_deleteRecord);
   1179     }
   1180     Record *r = new Record(name, data);
   1181     if (r == NULL) {
   1182         status = U_MEMORY_ALLOCATION_ERROR;
   1183         return *this;
   1184     }
   1185     inputList_->addElement(r, status);
   1186     clearBuckets();
   1187     //std::string ss;
   1188     //std::string ss2;
   1189     //std::cout << "added record: name = \"" << r->name_.toUTF8String(ss) << "\"" <<
   1190     //             "   sortingName = \"" << r->sortingName_.toUTF8String(ss2) << "\"" << std::endl;
   1191     return *this;
   1192 }
   1193 
   1194 
   1195 AlphabeticIndex &AlphabeticIndex::clearRecords(UErrorCode &status) {
   1196     if (U_SUCCESS(status) && inputList_ != NULL && !inputList_->isEmpty()) {
   1197         inputList_->removeAllElements();
   1198         clearBuckets();
   1199     }
   1200     return *this;
   1201 }
   1202 
   1203 int32_t AlphabeticIndex::getBucketIndex(const UnicodeString &name, UErrorCode &status) {
   1204     initBuckets(status);
   1205     if (U_FAILURE(status)) {
   1206         return 0;
   1207     }
   1208     return buckets_->getBucketIndex(name, *collatorPrimaryOnly_, status);
   1209 }
   1210 
   1211 
   1212 int32_t AlphabeticIndex::getBucketIndex() const {
   1213     return labelsIterIndex_;
   1214 }
   1215 
   1216 
   1217 UBool AlphabeticIndex::nextBucket(UErrorCode &status) {
   1218     if (U_FAILURE(status)) {
   1219         return FALSE;
   1220     }
   1221     if (buckets_ == NULL && currentBucket_ != NULL) {
   1222         status = U_ENUM_OUT_OF_SYNC_ERROR;
   1223         return FALSE;
   1224     }
   1225     initBuckets(status);
   1226     if (U_FAILURE(status)) {
   1227         return FALSE;
   1228     }
   1229     ++labelsIterIndex_;
   1230     if (labelsIterIndex_ >= buckets_->getBucketCount()) {
   1231         labelsIterIndex_ = buckets_->getBucketCount();
   1232         return FALSE;
   1233     }
   1234     currentBucket_ = getBucket(*buckets_->immutableVisibleList_, labelsIterIndex_);
   1235     resetRecordIterator();
   1236     return TRUE;
   1237 }
   1238 
   1239 const UnicodeString &AlphabeticIndex::getBucketLabel() const {
   1240     if (currentBucket_ != NULL) {
   1241         return currentBucket_->label_;
   1242     } else {
   1243         return emptyString_;
   1244     }
   1245 }
   1246 
   1247 
   1248 UAlphabeticIndexLabelType AlphabeticIndex::getBucketLabelType() const {
   1249     if (currentBucket_ != NULL) {
   1250         return currentBucket_->labelType_;
   1251     } else {
   1252         return U_ALPHAINDEX_NORMAL;
   1253     }
   1254 }
   1255 
   1256 
   1257 int32_t AlphabeticIndex::getBucketRecordCount() const {
   1258     if (currentBucket_ != NULL && currentBucket_->records_ != NULL) {
   1259         return currentBucket_->records_->size();
   1260     } else {
   1261         return 0;
   1262     }
   1263 }
   1264 
   1265 
   1266 AlphabeticIndex &AlphabeticIndex::resetBucketIterator(UErrorCode &status) {
   1267     if (U_FAILURE(status)) {
   1268         return *this;
   1269     }
   1270     internalResetBucketIterator();
   1271     return *this;
   1272 }
   1273 
   1274 
   1275 UBool AlphabeticIndex::nextRecord(UErrorCode &status) {
   1276     if (U_FAILURE(status)) {
   1277         return FALSE;
   1278     }
   1279     if (currentBucket_ == NULL) {
   1280         // We are trying to iterate over the items in a bucket, but there is no
   1281         // current bucket from the enumeration of buckets.
   1282         status = U_INVALID_STATE_ERROR;
   1283         return FALSE;
   1284     }
   1285     if (buckets_ == NULL) {
   1286         status = U_ENUM_OUT_OF_SYNC_ERROR;
   1287         return FALSE;
   1288     }
   1289     if (currentBucket_->records_ == NULL) {
   1290         return FALSE;
   1291     }
   1292     ++itemsIterIndex_;
   1293     if (itemsIterIndex_ >= currentBucket_->records_->size()) {
   1294         itemsIterIndex_  = currentBucket_->records_->size();
   1295         return FALSE;
   1296     }
   1297     return TRUE;
   1298 }
   1299 
   1300 
   1301 const UnicodeString &AlphabeticIndex::getRecordName() const {
   1302     const UnicodeString *retStr = &emptyString_;
   1303     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
   1304         itemsIterIndex_ >= 0 &&
   1305         itemsIterIndex_ < currentBucket_->records_->size()) {
   1306             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
   1307             retStr = &item->name_;
   1308     }
   1309     return *retStr;
   1310 }
   1311 
   1312 const void *AlphabeticIndex::getRecordData() const {
   1313     const void *retPtr = NULL;
   1314     if (currentBucket_ != NULL && currentBucket_->records_ != NULL &&
   1315         itemsIterIndex_ >= 0 &&
   1316         itemsIterIndex_ < currentBucket_->records_->size()) {
   1317             Record *item = static_cast<Record *>(currentBucket_->records_->elementAt(itemsIterIndex_));
   1318             retPtr = item->data_;
   1319     }
   1320     return retPtr;
   1321 }
   1322 
   1323 
   1324 AlphabeticIndex & AlphabeticIndex::resetRecordIterator() {
   1325     itemsIterIndex_ = -1;
   1326     return *this;
   1327 }
   1328 
   1329 
   1330 
   1331 AlphabeticIndex::Bucket::Bucket(const UnicodeString &label,
   1332                                 const UnicodeString &lowerBoundary,
   1333                                 UAlphabeticIndexLabelType type)
   1334         : label_(label), lowerBoundary_(lowerBoundary), labelType_(type),
   1335           displayBucket_(NULL), displayIndex_(-1),
   1336           records_(NULL) {
   1337 }
   1338 
   1339 
   1340 AlphabeticIndex::Bucket::~Bucket() {
   1341     delete records_;
   1342 }
   1343 
   1344 U_NAMESPACE_END
   1345 
   1346 #endif
   1347