Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  *******************************************************************************
      6  * Copyright (C) 2008-2016, Google Inc, International Business Machines Corporation
      7  * and others. All Rights Reserved.
      8  *******************************************************************************
      9  */
     10 package android.icu.text;
     11 
     12 import java.util.ArrayList;
     13 import java.util.Collections;
     14 import java.util.Comparator;
     15 import java.util.Iterator;
     16 import java.util.List;
     17 import java.util.Locale;
     18 
     19 import android.icu.lang.UCharacter;
     20 import android.icu.text.AlphabeticIndex.Bucket;
     21 import android.icu.text.AlphabeticIndex.Bucket.LabelType;
     22 import android.icu.util.LocaleData;
     23 import android.icu.util.ULocale;
     24 
     25 /**
     26  * AlphabeticIndex supports the creation of a UI index appropriate for a given language.
     27  * It can support either direct use, or use with a client that doesn't support localized collation.
     28  * The following is an example of what an index might look like in a UI:
     29  *
     30  * <pre>
     31  *  <b>... A B C D E F G H I J K L M N O P Q R S T U V W X Y Z  ...</b>
     32  *
     33  *  <b>A</b>
     34  *     Addison
     35  *     Albertson
     36  *     Azensky
     37  *  <b>B</b>
     38  *     Baecker
     39  *  ...
     40  * </pre>
     41  *
     42  * The class can generate a list of labels for use as a UI "index", that is, a list of
     43  * clickable characters (or character sequences) that allow the user to see a segment
     44  * (bucket) of a larger "target" list. That is, each label corresponds to a bucket in
     45  * the target list, where everything in the bucket is greater than or equal to the character
     46  * (according to the locale's collation). Strings can be added to the index;
     47  * they will be in sorted order in the right bucket.
     48  * <p>
     49  * The class also supports having buckets for strings before the first (underflow),
     50  * after the last (overflow), and between scripts (inflow). For example, if the index
     51  * is constructed with labels for Russian and English, Greek characters would fall
     52  * into an inflow bucket between the other two scripts.
     53  *
     54  * <p><em>Note:</em> If you expect to have a lot of ASCII or Latin characters
     55  * as well as characters from the user's language,
     56  * then it is a good idea to call addLabels(ULocale.English).
     57  *
     58  * <h2>Direct Use</h2>
     59  * <p>The following shows an example of building an index directly.
     60  *  The "show..." methods below are just to illustrate usage.
     61  *
     62  * <pre>
     63  * // Create a simple index where the values for the strings are Integers, and add the strings
     64  *
     65  * AlphabeticIndex&lt;Integer&gt; index = new AlphabeticIndex&lt;Integer&gt;(desiredLocale).addLabels(additionalLocale);
     66  * int counter = 0;
     67  * for (String item : test) {
     68  *     index.addRecord(item, counter++);
     69  * }
     70  * ...
     71  * // Show index at top. We could skip or gray out empty buckets
     72  *
     73  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
     74  *     if (showAll || bucket.size() != 0) {
     75  *         showLabelAtTop(UI, bucket.getLabel());
     76  *     }
     77  * }
     78  *  ...
     79  * // Show the buckets with their contents, skipping empty buckets
     80  *
     81  * for (AlphabeticIndex.Bucket&lt;Integer&gt; bucket : index) {
     82  *     if (bucket.size() != 0) {
     83  *         showLabelInList(UI, bucket.getLabel());
     84  *         for (AlphabeticIndex.Record&lt;Integer&gt; item : bucket) {
     85  *             showIndexedItem(UI, item.getName(), item.getData());
     86  *         }
     87  * </pre>
     88  *
     89  * The caller can build different UIs using this class.
     90  * For example, an index character could be omitted or grayed-out
     91  * if its bucket is empty. Small buckets could also be combined based on size, such as:
     92  *
     93  * <pre>
     94  * <b>... A-F G-N O-Z ...</b>
     95  * </pre>
     96  *
     97  * <h2>Client Support</h2>
     98  * <p>Callers can also use the {@link AlphabeticIndex.ImmutableIndex}, or the AlphabeticIndex itself,
     99  * to support sorting on a client that doesn't support AlphabeticIndex functionality.
    100  *
    101  * <p>The ImmutableIndex is both immutable and thread-safe.
    102  * The corresponding AlphabeticIndex methods are not thread-safe because
    103  * they "lazily" build the index buckets.
    104  * <ul>
    105  * <li>ImmutableIndex.getBucket(index) provides random access to all
    106  *     buckets and their labels and label types.
    107  * <li>AlphabeticIndex.getBucketLabels() or the bucket iterator on either class
    108  *     can be used to get a list of the labels,
    109  *     such as "...", "A", "B",..., and send that list to the client.
    110  * <li>When the client has a new name, it sends that name to the server.
    111  * The server needs to call the following methods,
    112  * and communicate the bucketIndex and collationKey back to the client.
    113  *
    114  * <pre>
    115  * int bucketIndex = index.getBucketIndex(name);
    116  * String label = immutableIndex.getBucket(bucketIndex).getLabel();  // optional
    117  * RawCollationKey collationKey = collator.getRawCollationKey(name, null);
    118  * </pre>
    119  *
    120  * <li>The client would put the name (and associated information) into its bucket for bucketIndex. The collationKey is a
    121  * sequence of bytes that can be compared with a binary compare, and produce the right localized result.</li>
    122  * </ul>
    123  *
    124  * @author Mark Davis
    125  */
    126 public final class AlphabeticIndex<V> implements Iterable<Bucket<V>> {
    127     /**
    128      * Prefix string for Chinese index buckets.
    129      * See http://unicode.org/repos/cldr/trunk/specs/ldml/tr35-collation.html#Collation_Indexes
    130      */
    131     private static final String BASE = "\uFDD0";
    132 
    133     private static final char CGJ = '\u034F';
    134 
    135     private static final Comparator<String> binaryCmp = new UTF16.StringComparator(true, false, 0);
    136 
    137     private final RuleBasedCollator collatorOriginal;
    138     private final RuleBasedCollator collatorPrimaryOnly;
    139     private RuleBasedCollator collatorExternal;
    140 
    141     // Comparator for records, so that the Record class can be static.
    142     private final Comparator<Record<V>> recordComparator = new Comparator<Record<V>>() {
    143         @Override
    144         public int compare(Record<V> o1, Record<V> o2) {
    145             return collatorOriginal.compare(o1.name, o2.name);
    146         }
    147     };
    148 
    149     private final List<String> firstCharsInScripts;
    150 
    151     // We accumulate these as we build up the input parameters
    152     private final UnicodeSet initialLabels = new UnicodeSet();
    153     private List<Record<V>> inputList;
    154 
    155     // Lazy evaluated: null means that we have not built yet.
    156     private BucketList<V> buckets;
    157 
    158     private String overflowLabel = "\u2026";
    159     private String underflowLabel = "\u2026";
    160     private String inflowLabel = "\u2026";
    161 
    162     /**
    163      * Immutable, thread-safe version of {@link AlphabeticIndex}.
    164      * This class provides thread-safe methods for bucketing,
    165      * and random access to buckets and their properties,
    166      * but does not offer adding records to the index.
    167      *
    168      * @param <V> The Record value type is unused. It can be omitted for this class
    169      * if it was omitted for the AlphabeticIndex that built it.
    170      */
    171     public static final class ImmutableIndex<V> implements Iterable<Bucket<V>> {
    172         private final BucketList<V> buckets;
    173         private final Collator collatorPrimaryOnly;
    174 
    175         private ImmutableIndex(BucketList<V> bucketList, Collator collatorPrimaryOnly) {
    176             this.buckets = bucketList;
    177             this.collatorPrimaryOnly = collatorPrimaryOnly;
    178         }
    179 
    180         /**
    181          * Returns the number of index buckets and labels, including underflow/inflow/overflow.
    182          *
    183          * @return the number of index buckets
    184          */
    185         public int getBucketCount() {
    186             return buckets.getBucketCount();
    187         }
    188 
    189         /**
    190          * Finds the index bucket for the given name and returns the number of that bucket.
    191          * Use {@link #getBucket(int)} to get the bucket's properties.
    192          *
    193          * @param name the string to be sorted into an index bucket
    194          * @return the bucket number for the name
    195          */
    196         public int getBucketIndex(CharSequence name) {
    197             return buckets.getBucketIndex(name, collatorPrimaryOnly);
    198         }
    199 
    200         /**
    201          * Returns the index-th bucket. Returns null if the index is out of range.
    202          *
    203          * @param index bucket number
    204          * @return the index-th bucket
    205          */
    206         public Bucket<V> getBucket(int index) {
    207             if (0 <= index && index < buckets.getBucketCount()) {
    208                 return buckets.immutableVisibleList.get(index);
    209             } else {
    210                 return null;
    211             }
    212         }
    213 
    214         /**
    215          * {@inheritDoc}
    216          */
    217         @Override
    218         public Iterator<Bucket<V>> iterator() {
    219             return buckets.iterator();
    220         }
    221     }
    222 
    223     /**
    224      * Create the index object.
    225      *
    226      * @param locale
    227      *            The locale for the index.
    228      */
    229     public AlphabeticIndex(ULocale locale) {
    230         this(locale, null);
    231     }
    232 
    233     /**
    234      * Create the index object.
    235      *
    236      * @param locale
    237      *            The locale for the index.
    238      */
    239     public AlphabeticIndex(Locale locale) {
    240         this(ULocale.forLocale(locale), null);
    241     }
    242 
    243     /**
    244      * Create an AlphabeticIndex that uses a specific collator.
    245      *
    246      * <p>The index will be created with no labels; the addLabels() function must be called
    247      * after creation to add the desired labels to the index.
    248      *
    249      * <p>The index will work directly with the supplied collator. If the caller will need to
    250      * continue working with the collator it should be cloned first, so that the
    251      * collator provided to the AlphabeticIndex remains unchanged after creation of the index.
    252      *
    253      * @param collator The collator to use to order the contents of this index.
    254      */
    255     public AlphabeticIndex(RuleBasedCollator collator) {
    256         this(null, collator);
    257     }
    258 
    259     /**
    260      * Internal constructor containing implementation used by public constructors.
    261      */
    262     private AlphabeticIndex(ULocale locale, RuleBasedCollator collator) {
    263         collatorOriginal = collator != null ? collator : (RuleBasedCollator) Collator.getInstance(locale);
    264         try {
    265             collatorPrimaryOnly = collatorOriginal.cloneAsThawed();
    266         } catch (Exception e) {
    267             // should never happen
    268             throw new IllegalStateException("Collator cannot be cloned", e);
    269         }
    270         collatorPrimaryOnly.setStrength(Collator.PRIMARY);
    271         collatorPrimaryOnly.freeze();
    272 
    273         firstCharsInScripts = getFirstCharactersInScripts();
    274         Collections.sort(firstCharsInScripts, collatorPrimaryOnly);
    275         // Guard against a degenerate collator where
    276         // some script boundary strings are primary ignorable.
    277         for (;;) {
    278             if (firstCharsInScripts.isEmpty()) {
    279                 throw new IllegalArgumentException(
    280                         "AlphabeticIndex requires some non-ignorable script boundary strings");
    281             }
    282             if (collatorPrimaryOnly.compare(firstCharsInScripts.get(0), "") == 0) {
    283                 firstCharsInScripts.remove(0);
    284             } else {
    285                 break;
    286             }
    287         }
    288 
    289         // Chinese index characters, which are specific to each of the several Chinese tailorings,
    290         // take precedence over the single locale data exemplar set per language.
    291         if (!addChineseIndexCharacters() && locale != null) {
    292             addIndexExemplars(locale);
    293         }
    294     }
    295 
    296     /**
    297      * Add more index characters (aside from what are in the locale)
    298      * @param additions additional characters to add to the index, such as A-Z.
    299      * @return this, for chaining
    300      */
    301     public AlphabeticIndex<V> addLabels(UnicodeSet additions) {
    302         initialLabels.addAll(additions);
    303         buckets = null;
    304         return this;
    305     }
    306 
    307     /**
    308      * Add more index characters (aside from what are in the locale)
    309      * @param additions additional characters to add to the index, such as those in Swedish.
    310      * @return this, for chaining
    311      */
    312     public AlphabeticIndex<V> addLabels(ULocale... additions) {
    313         for (ULocale addition : additions) {
    314             addIndexExemplars(addition);
    315         }
    316         buckets = null;
    317         return this;
    318     }
    319 
    320     /**
    321      * Add more index characters (aside from what are in the locale)
    322      * @param additions additional characters to add to the index, such as those in Swedish.
    323      * @return this, for chaining
    324      */
    325     public AlphabeticIndex<V> addLabels(Locale... additions) {
    326         for (Locale addition : additions) {
    327             addIndexExemplars(ULocale.forLocale(addition));
    328         }
    329         buckets = null;
    330         return this;
    331     }
    332 
    333     /**
    334      * Set the overflow label
    335      * @param overflowLabel see class description
    336      * @return this, for chaining
    337      */
    338     public AlphabeticIndex<V> setOverflowLabel(String overflowLabel) {
    339         this.overflowLabel = overflowLabel;
    340         buckets = null;
    341         return this;
    342     }
    343 
    344     /**
    345      * Get the default label used in the IndexCharacters' locale for underflow, eg the last item in: X Y Z ...
    346      *
    347      * @return underflow label
    348      */
    349     public String getUnderflowLabel() {
    350         return underflowLabel; // TODO get localized version
    351     }
    352 
    353 
    354     /**
    355      * Set the underflowLabel label
    356      * @param underflowLabel see class description
    357      * @return this, for chaining
    358      */
    359     public AlphabeticIndex<V> setUnderflowLabel(String underflowLabel) {
    360         this.underflowLabel = underflowLabel;
    361         buckets = null;
    362         return this;
    363     }
    364 
    365     /**
    366      * Get the default label used in the IndexCharacters' locale for overflow, eg the first item in: ... A B C
    367      *
    368      * @return overflow label
    369      */
    370     public String getOverflowLabel() {
    371         return overflowLabel; // TODO get localized version
    372     }
    373 
    374 
    375     /**
    376      * Set the inflowLabel label
    377      * @param inflowLabel see class description
    378      * @return this, for chaining
    379      */
    380     public AlphabeticIndex<V> setInflowLabel(String inflowLabel) {
    381         this.inflowLabel = inflowLabel;
    382         buckets = null;
    383         return this;
    384     }
    385 
    386     /**
    387      * Get the default label used for abbreviated buckets <i>between</i> other labels. For example, consider the labels
    388      * for Latin and Greek are used: X Y Z ... &#x0391; &#x0392; &#x0393;.
    389      *
    390      * @return inflow label
    391      */
    392     public String getInflowLabel() {
    393         return inflowLabel; // TODO get localized version
    394     }
    395 
    396 
    397     /**
    398      * Get the limit on the number of labels in the index. The number of buckets can be slightly larger: see getBucketCount().
    399      *
    400      * @return maxLabelCount maximum number of labels.
    401      */
    402     public int getMaxLabelCount() {
    403         return maxLabelCount;
    404     }
    405 
    406     /**
    407      * Set a limit on the number of labels in the index. The number of buckets can be slightly larger: see
    408      * getBucketCount().
    409      *
    410      * @param maxLabelCount Set the maximum number of labels. Currently, if the number is exceeded, then every
    411      *         nth item is removed to bring the count down. A more sophisticated mechanism may be available in the
    412      *         future.
    413      * @return this, for chaining
    414      */
    415     public AlphabeticIndex<V> setMaxLabelCount(int maxLabelCount) {
    416         this.maxLabelCount = maxLabelCount;
    417         buckets = null;
    418         return this;
    419     }
    420 
    421     /**
    422      * Determine the best labels to use. This is based on the exemplars, but we also process to make sure that they are unique,
    423      * and sort differently, and that the overall list is small enough.
    424      */
    425     private List<String> initLabels() {
    426         Normalizer2 nfkdNormalizer = Normalizer2.getNFKDInstance();
    427         List<String> indexCharacters = new ArrayList<String>();
    428 
    429         String firstScriptBoundary = firstCharsInScripts.get(0);
    430         String overflowBoundary = firstCharsInScripts.get(firstCharsInScripts.size() - 1);
    431 
    432         // We make a sorted array of elements.
    433         // Some of the input may be redundant.
    434         // That is, we might have c, ch, d, where "ch" sorts just like "c", "h".
    435         // We filter out those cases.
    436         for (String item : initialLabels) {
    437             boolean checkDistinct;
    438             if (!UTF16.hasMoreCodePointsThan(item, 1)) {
    439                 checkDistinct = false;
    440             } else if(item.charAt(item.length() - 1) == '*' &&
    441                     item.charAt(item.length() - 2) != '*') {
    442                 // Use a label if it is marked with one trailing star,
    443                 // even if the label string sorts the same when all contractions are suppressed.
    444                 item = item.substring(0, item.length() - 1);
    445                 checkDistinct = false;
    446             } else {
    447                 checkDistinct = true;
    448             }
    449             if (collatorPrimaryOnly.compare(item, firstScriptBoundary) < 0) {
    450                 // Ignore a primary-ignorable or non-alphabetic index character.
    451             } else if (collatorPrimaryOnly.compare(item, overflowBoundary) >= 0) {
    452                 // Ignore an index character that will land in the overflow bucket.
    453             } else if (checkDistinct && collatorPrimaryOnly.compare(item, separated(item)) == 0) {
    454                 // Ignore a multi-code point index character that does not sort distinctly
    455                 // from the sequence of its separate characters.
    456             } else {
    457                 int insertionPoint = Collections.binarySearch(indexCharacters, item, collatorPrimaryOnly);
    458                 if (insertionPoint < 0) {
    459                     indexCharacters.add(~insertionPoint, item);
    460                 } else {
    461                     String itemAlreadyIn = indexCharacters.get(insertionPoint);
    462                     if (isOneLabelBetterThanOther(nfkdNormalizer, item, itemAlreadyIn)) {
    463                         indexCharacters.set(insertionPoint, item);
    464                     }
    465                 }
    466             }
    467         }
    468 
    469         // if the result is still too large, cut down to maxLabelCount elements, by removing every nth element
    470 
    471         final int size = indexCharacters.size() - 1;
    472         if (size > maxLabelCount) {
    473             int count = 0;
    474             int old = -1;
    475             for (Iterator<String> it = indexCharacters.iterator(); it.hasNext();) {
    476                 ++count;
    477                 it.next();
    478                 final int bump = count * maxLabelCount / size;
    479                 if (bump == old) {
    480                     it.remove();
    481                 } else {
    482                     old = bump;
    483                 }
    484             }
    485         }
    486 
    487         return indexCharacters;
    488     }
    489 
    490     private static String fixLabel(String current) {
    491         if (!current.startsWith(BASE)) {
    492             return current;
    493         }
    494         int rest = current.charAt(BASE.length());
    495         if (0x2800 < rest && rest <= 0x28FF) { // stroke count
    496             return (rest-0x2800) + "\u5283";
    497         }
    498         return current.substring(BASE.length());
    499     }
    500 
    501     /**
    502      * This method is called to get the index exemplars. Normally these come from the locale directly,
    503      * but if they aren't available, we have to synthesize them.
    504      */
    505     private void addIndexExemplars(ULocale locale) {
    506         UnicodeSet exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_INDEX);
    507         // Android-changed: check for empty exemplar sets (http://b/64953401).
    508         if (exemplars != null && !exemplars.isEmpty()) {
    509             initialLabels.addAll(exemplars);
    510             return;
    511         }
    512 
    513         // The locale data did not include explicit Index characters.
    514         // Synthesize a set of them from the locale's standard exemplar characters.
    515         exemplars = LocaleData.getExemplarSet(locale, 0, LocaleData.ES_STANDARD);
    516 
    517         exemplars = exemplars.cloneAsThawed();
    518         // question: should we add auxiliary exemplars?
    519         if (exemplars.containsSome('a', 'z') || exemplars.size() == 0) {
    520             exemplars.addAll('a', 'z');
    521         }
    522         if (exemplars.containsSome(0xAC00, 0xD7A3)) {  // Hangul syllables
    523             // cut down to small list
    524             exemplars.remove(0xAC00, 0xD7A3).
    525                 add(0xAC00).add(0xB098).add(0xB2E4).add(0xB77C).
    526                 add(0xB9C8).add(0xBC14).add(0xC0AC).add(0xC544).
    527                 add(0xC790).add(0xCC28).add(0xCE74).add(0xD0C0).
    528                 add(0xD30C).add(0xD558);
    529         }
    530         if (exemplars.containsSome(0x1200, 0x137F)) {  // Ethiopic block
    531             // cut down to small list
    532             // make use of the fact that Ethiopic is allocated in 8's, where
    533             // the base is 0 mod 8.
    534             UnicodeSet ethiopic = new UnicodeSet("[[:Block=Ethiopic:]&[:Script=Ethiopic:]]");
    535             UnicodeSetIterator it = new UnicodeSetIterator(ethiopic);
    536             while (it.next() && it.codepoint != UnicodeSetIterator.IS_STRING) {
    537                 if ((it.codepoint & 0x7) != 0) {
    538                     exemplars.remove(it.codepoint);
    539                 }
    540             }
    541         }
    542 
    543         // Upper-case any that aren't already so.
    544         //   (We only do this for synthesized index characters.)
    545         for (String item : exemplars) {
    546             initialLabels.add(UCharacter.toUpperCase(locale, item));
    547         }
    548     }
    549 
    550     /**
    551      * Add Chinese index characters from the tailoring.
    552      */
    553     private boolean addChineseIndexCharacters() {
    554         UnicodeSet contractions = new UnicodeSet();
    555         try {
    556             collatorPrimaryOnly.internalAddContractions(BASE.charAt(0), contractions);
    557         } catch (Exception e) {
    558             return false;
    559         }
    560         if (contractions.isEmpty()) { return false; }
    561         initialLabels.addAll(contractions);
    562         for (String s : contractions) {
    563             assert(s.startsWith(BASE));
    564             char c = s.charAt(s.length() - 1);
    565             if (0x41 <= c && c <= 0x5A) {  // A-Z
    566                 // There are Pinyin labels, add ASCII A-Z labels as well.
    567                 initialLabels.add(0x41, 0x5A);  // A-Z
    568                 break;
    569             }
    570         }
    571         return true;
    572     }
    573 
    574     /**
    575      * Return the string with interspersed CGJs. Input must have more than 2 codepoints.
    576      * <p>This is used to test whether contractions sort differently from their components.
    577      */
    578     private String separated(String item) {
    579         StringBuilder result = new StringBuilder();
    580         // add a CGJ except within surrogates
    581         char last = item.charAt(0);
    582         result.append(last);
    583         for (int i = 1; i < item.length(); ++i) {
    584             char ch = item.charAt(i);
    585             if (!UCharacter.isHighSurrogate(last) || !UCharacter.isLowSurrogate(ch)) {
    586                 result.append(CGJ);
    587             }
    588             result.append(ch);
    589             last = ch;
    590         }
    591         return result.toString();
    592     }
    593 
    594     /**
    595      * Builds an immutable, thread-safe version of this instance, without data records.
    596      *
    597      * @return an immutable index instance
    598      */
    599     public ImmutableIndex<V> buildImmutableIndex() {
    600         // The current AlphabeticIndex Java code never modifies the bucket list once built.
    601         // If it contains no records, we can use it.
    602         // addRecord() sets buckets=null rather than inserting the new record into it.
    603         BucketList<V> immutableBucketList;
    604         if (inputList != null && !inputList.isEmpty()) {
    605             // We need a bucket list with no records.
    606             immutableBucketList = createBucketList();
    607         } else {
    608             if (buckets == null) {
    609                 buckets = createBucketList();
    610             }
    611             immutableBucketList = buckets;
    612         }
    613         return new ImmutableIndex<V>(immutableBucketList, collatorPrimaryOnly);
    614     }
    615 
    616     /**
    617      * Get the labels.
    618      *
    619      * @return The list of bucket labels, after processing.
    620      */
    621     public List<String> getBucketLabels() {
    622         initBuckets();
    623         ArrayList<String> result = new ArrayList<String>();
    624         for (Bucket<V> bucket : buckets) {
    625             result.add(bucket.getLabel());
    626         }
    627         return result;
    628     }
    629 
    630     /**
    631      * Get a clone of the collator used internally. Note that for performance reasons, the clone is only done once, and
    632      * then stored. The next time it is accessed, the same instance is returned.
    633      * <p>
    634      * <b><i>Don't use this method across threads if you are changing the settings on the collator, at least not without
    635      * synchronizing.</i></b>
    636      *
    637      * @return a clone of the collator used internally
    638      */
    639     public RuleBasedCollator getCollator() {
    640         if (collatorExternal == null) {
    641             try {
    642                 collatorExternal = (RuleBasedCollator) (collatorOriginal.clone());
    643             } catch (Exception e) {
    644                 // should never happen
    645                 throw new IllegalStateException("Collator cannot be cloned", e);
    646             }
    647         }
    648         return collatorExternal;
    649     }
    650 
    651     /**
    652      * Add a record (name and data) to the index. The name will be used to sort the items into buckets, and to sort
    653      * within the bucket. Two records may have the same name. When they do, the sort order is according to the order added:
    654      * the first added comes first.
    655      *
    656      * @param name
    657      *            Name, such as a name
    658      * @param data
    659      *            Data, such as an address or link
    660      * @return this, for chaining
    661      */
    662     public AlphabeticIndex<V> addRecord(CharSequence name, V data) {
    663         // TODO instead of invalidating, just add to unprocessed list.
    664         buckets = null; // invalidate old bucketlist
    665         if (inputList == null) {
    666             inputList = new ArrayList<Record<V>>();
    667         }
    668         inputList.add(new Record<V>(name, data));
    669         return this;
    670     }
    671 
    672     /**
    673      * Get the bucket number for the given name. This routine permits callers to implement their own bucket handling
    674      * mechanisms, including client-server handling. For example, when a new name is created on the client, it can ask
    675      * the server for the bucket for that name, and the sortkey (using getCollator). Once the client has that
    676      * information, it can put the name into the right bucket, and sort it within that bucket, without having access to
    677      * the index or collator.
    678      * <p>
    679      * Note that the bucket number (and sort key) are only valid for the settings of the current AlphabeticIndex; if
    680      * those are changed, then the bucket number and sort key must be regenerated.
    681      *
    682      * @param name
    683      *            Name, such as a name
    684      * @return the bucket index for the name
    685      */
    686     public int getBucketIndex(CharSequence name) {
    687         initBuckets();
    688         return buckets.getBucketIndex(name, collatorPrimaryOnly);
    689     }
    690 
    691     /**
    692      * Clear the index.
    693      *
    694      * @return this, for chaining
    695      */
    696     public AlphabeticIndex<V> clearRecords() {
    697         if (inputList != null && !inputList.isEmpty()) {
    698             inputList.clear();
    699             buckets = null;
    700         }
    701         return this;
    702     }
    703 
    704     /**
    705      * Return the number of buckets in the index. This will be the same as the number of labels, plus buckets for the underflow, overflow, and inflow(s).
    706      *
    707      * @return number of buckets
    708      */
    709     public int getBucketCount() {
    710         initBuckets();
    711         return buckets.getBucketCount();
    712     }
    713 
    714     /**
    715      * Return the number of records in the index: that is, the total number of distinct &lt;name,data&gt; pairs added with addRecord(...), over all the buckets.
    716      *
    717      * @return total number of records in buckets
    718      */
    719     public int getRecordCount() {
    720         return inputList != null ? inputList.size() : 0;
    721     }
    722 
    723     /**
    724      * Return an iterator over the buckets.
    725      *
    726      * @return iterator over buckets.
    727      */
    728     @Override
    729     public Iterator<Bucket<V>> iterator() {
    730         initBuckets();
    731         return buckets.iterator();
    732     }
    733 
    734     /**
    735      * Creates an index, and buckets and sorts the list of records into the index.
    736      */
    737     private void initBuckets() {
    738         if (buckets != null) {
    739             return;
    740         }
    741         buckets = createBucketList();
    742         if (inputList == null || inputList.isEmpty()) {
    743             return;
    744         }
    745 
    746         // Sort the records by name.
    747         // Stable sort preserves input order of collation duplicates.
    748         Collections.sort(inputList, recordComparator);
    749 
    750         // Now, we traverse all of the input, which is now sorted.
    751         // If the item doesn't go in the current bucket, we find the next bucket that contains it.
    752         // This makes the process order n*log(n), since we just sort the list and then do a linear process.
    753         // However, if the user adds an item at a time and then gets the buckets, this isn't efficient, so
    754         // we need to improve it for that case.
    755 
    756         Iterator<Bucket<V>> bucketIterator = buckets.fullIterator();
    757         Bucket<V> currentBucket = bucketIterator.next();
    758         Bucket<V> nextBucket;
    759         String upperBoundary;
    760         if (bucketIterator.hasNext()) {
    761             nextBucket = bucketIterator.next();
    762             upperBoundary = nextBucket.lowerBoundary;
    763         } else {
    764             nextBucket = null;
    765             upperBoundary = null;
    766         }
    767         for (Record<V> r : inputList) {
    768             // if the current bucket isn't the right one, find the one that is
    769             // We have a special flag for the last bucket so that we don't look any further
    770             while (upperBoundary != null &&
    771                     collatorPrimaryOnly.compare(r.name, upperBoundary) >= 0) {
    772                 currentBucket = nextBucket;
    773                 // now reset the boundary that we compare against
    774                 if (bucketIterator.hasNext()) {
    775                     nextBucket = bucketIterator.next();
    776                     upperBoundary = nextBucket.lowerBoundary;
    777                 } else {
    778                     upperBoundary = null;
    779                 }
    780             }
    781             // now put the record into the bucket.
    782             Bucket<V> bucket = currentBucket;
    783             if (bucket.displayBucket != null) {
    784                 bucket = bucket.displayBucket;
    785             }
    786             if (bucket.records == null) {
    787                 bucket.records = new ArrayList<Record<V>>();
    788             }
    789             bucket.records.add(r);
    790         }
    791     }
    792 
    793     private int maxLabelCount = 99;
    794 
    795     /**
    796      * Returns true if one index character string is "better" than the other.
    797      * Shorter NFKD is better, and otherwise NFKD-binary-less-than is
    798      * better, and otherwise binary-less-than is better.
    799      */
    800     private static boolean isOneLabelBetterThanOther(Normalizer2 nfkdNormalizer, String one, String other) {
    801         // This is called with primary-equal strings, but never with one.equals(other).
    802         String n1 = nfkdNormalizer.normalize(one);
    803         String n2 = nfkdNormalizer.normalize(other);
    804         int result = n1.codePointCount(0, n1.length()) - n2.codePointCount(0, n2.length());
    805         if (result != 0) {
    806             return result < 0;
    807         }
    808         result = binaryCmp.compare(n1, n2);
    809         if (result != 0) {
    810             return result < 0;
    811         }
    812         return binaryCmp.compare(one, other) < 0;
    813     }
    814 
    815     /**
    816      * A (name, data) pair, to be sorted by name into one of the index buckets.
    817      * The user data is not used by the index implementation.
    818      */
    819     public static class Record<V> {
    820         private final CharSequence name;
    821         private final V data;
    822 
    823         private Record(CharSequence name, V data) {
    824             this.name = name;
    825             this.data = data;
    826         }
    827 
    828         /**
    829          * Get the name
    830          *
    831          * @return the name
    832          */
    833         public CharSequence getName() {
    834             return name;
    835         }
    836 
    837         /**
    838          * Get the data
    839          *
    840          * @return the data
    841          */
    842         public V getData() {
    843             return data;
    844         }
    845 
    846         /**
    847          * Standard toString()
    848          */
    849         @Override
    850         public String toString() {
    851             return name + "=" + data;
    852         }
    853     }
    854 
    855     /**
    856      * An index "bucket" with a label string and type.
    857      * It is referenced by {@link AlphabeticIndex#getBucketIndex(CharSequence)}
    858      * and {@link AlphabeticIndex.ImmutableIndex#getBucketIndex(CharSequence)},
    859      * returned by {@link AlphabeticIndex.ImmutableIndex#getBucket(int)},
    860      * and {@link AlphabeticIndex#addRecord(CharSequence, Object)} adds a record
    861      * into a bucket according to the record's name.
    862      *
    863      * @param <V>
    864      *            Data type
    865      */
    866     public static class Bucket<V> implements Iterable<Record<V>> {
    867         private final String label;
    868         private final String lowerBoundary;
    869         private final LabelType labelType;
    870         private Bucket<V> displayBucket;
    871         private int displayIndex;
    872         private List<Record<V>> records;
    873 
    874         /**
    875          * Type of the label
    876          */
    877         public enum LabelType {
    878             /**
    879              * Normal
    880              */
    881             NORMAL,
    882             /**
    883              * Underflow (before the first)
    884              */
    885             UNDERFLOW,
    886             /**
    887              * Inflow (between scripts)
    888              */
    889             INFLOW,
    890             /**
    891              * Overflow (after the last)
    892              */
    893             OVERFLOW
    894         }
    895 
    896         /**
    897          * Set up the bucket.
    898          *
    899          * @param label
    900          *            label for the bucket
    901          * @param labelType
    902          *            is an underflow, overflow, or inflow bucket
    903          */
    904         private Bucket(String label, String lowerBoundary, LabelType labelType) {
    905             this.label = label;
    906             this.lowerBoundary = lowerBoundary;
    907             this.labelType = labelType;
    908         }
    909 
    910         /**
    911          * Get the label
    912          *
    913          * @return label for the bucket
    914          */
    915         public String getLabel() {
    916             return label;
    917         }
    918 
    919         /**
    920          * Is a normal, underflow, overflow, or inflow bucket
    921          *
    922          * @return is an underflow, overflow, or inflow bucket
    923          */
    924         public LabelType getLabelType() {
    925             return labelType;
    926         }
    927 
    928         /**
    929          * Get the number of records in the bucket.
    930          *
    931          * @return number of records in bucket
    932          */
    933         public int size() {
    934             return records == null ? 0 : records.size();
    935         }
    936 
    937         /**
    938          * Iterator over the records in the bucket
    939          */
    940         @Override
    941         public Iterator<Record<V>> iterator() {
    942             if (records == null) {
    943                 return Collections.<Record<V>>emptyList().iterator();
    944             }
    945             return records.iterator();
    946         }
    947 
    948         /**
    949          * Standard toString()
    950          */
    951         @Override
    952         public String toString() {
    953             return "{" +
    954             "labelType=" + labelType
    955             + ", " +
    956             "lowerBoundary=" + lowerBoundary
    957             + ", " +
    958             "label=" + label
    959             + "}"
    960             ;
    961         }
    962     }
    963 
    964     private BucketList<V> createBucketList() {
    965         // Initialize indexCharacters.
    966         List<String> indexCharacters = initLabels();
    967 
    968         // Variables for hasMultiplePrimaryWeights().
    969         long variableTop;
    970         if (collatorPrimaryOnly.isAlternateHandlingShifted()) {
    971             variableTop = collatorPrimaryOnly.getVariableTop() & 0xffffffffL;
    972         } else {
    973             variableTop = 0;
    974         }
    975         boolean hasInvisibleBuckets = false;
    976 
    977         // Helper arrays for Chinese Pinyin collation.
    978         @SuppressWarnings({ "rawtypes", "unchecked" })
    979         Bucket<V>[] asciiBuckets = new Bucket[26];
    980         @SuppressWarnings({ "rawtypes", "unchecked" })
    981         Bucket<V>[] pinyinBuckets = new Bucket[26];
    982         boolean hasPinyin = false;
    983 
    984         ArrayList<Bucket<V>> bucketList = new ArrayList<Bucket<V>>();
    985 
    986         // underflow bucket
    987         bucketList.add(new Bucket<V>(getUnderflowLabel(), "", LabelType.UNDERFLOW));
    988 
    989         // fix up the list, adding underflow, additions, overflow
    990         // Insert inflow labels as needed.
    991         int scriptIndex = -1;
    992         String scriptUpperBoundary = "";
    993         for (String current : indexCharacters) {
    994             if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) >= 0) {
    995                 // We crossed the script boundary into a new script.
    996                 String inflowBoundary = scriptUpperBoundary;
    997                 boolean skippedScript = false;
    998                 for (;;) {
    999                     scriptUpperBoundary = firstCharsInScripts.get(++scriptIndex);
   1000                     if (collatorPrimaryOnly.compare(current, scriptUpperBoundary) < 0) {
   1001                         break;
   1002                     }
   1003                     skippedScript = true;
   1004                 }
   1005                 if (skippedScript && bucketList.size() > 1) {
   1006                     // We are skipping one or more scripts,
   1007                     // and we are not just getting out of the underflow label.
   1008                     bucketList.add(new Bucket<V>(getInflowLabel(), inflowBoundary,
   1009                             LabelType.INFLOW));
   1010                 }
   1011             }
   1012             // Add a bucket with the current label.
   1013             Bucket<V> bucket = new Bucket<V>(fixLabel(current), current, LabelType.NORMAL);
   1014             bucketList.add(bucket);
   1015             // Remember ASCII and Pinyin buckets for Pinyin redirects.
   1016             char c;
   1017             if (current.length() == 1 && 'A' <= (c = current.charAt(0)) && c <= 'Z') {
   1018                 asciiBuckets[c - 'A'] = bucket;
   1019             } else if (current.length() == BASE.length() + 1 && current.startsWith(BASE) &&
   1020                     'A' <= (c = current.charAt(BASE.length())) && c <= 'Z') {
   1021                 pinyinBuckets[c - 'A'] = bucket;
   1022                 hasPinyin = true;
   1023             }
   1024             // Check for multiple primary weights.
   1025             if (!current.startsWith(BASE) &&
   1026                     hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, current) &&
   1027                     !current.endsWith("\uffff")) {
   1028                 // "" or "Sch" etc.
   1029                 for (int i = bucketList.size() - 2;; --i) {
   1030                     Bucket<V> singleBucket = bucketList.get(i);
   1031                     if (singleBucket.labelType != LabelType.NORMAL) {
   1032                         // There is no single-character bucket since the last
   1033                         // underflow or inflow label.
   1034                         break;
   1035                     }
   1036                     if (singleBucket.displayBucket == null &&
   1037                             !hasMultiplePrimaryWeights(collatorPrimaryOnly, variableTop, singleBucket.lowerBoundary)) {
   1038                         // Add an invisible bucket that redirects strings greater than the expansion
   1039                         // to the previous single-character bucket.
   1040                         // For example, after ... Q R S Sch we add Sch\uFFFF->S
   1041                         // and after ... Q R S Sch Sch\uFFFF St we add St\uFFFF->S.
   1042                         bucket = new Bucket<V>("", current + "\uFFFF", LabelType.NORMAL);
   1043                         bucket.displayBucket = singleBucket;
   1044                         bucketList.add(bucket);
   1045                         hasInvisibleBuckets = true;
   1046                         break;
   1047                     }
   1048                 }
   1049             }
   1050         }
   1051         if (bucketList.size() == 1) {
   1052             // No real labels, show only the underflow label.
   1053             return new BucketList<V>(bucketList, bucketList);
   1054         }
   1055         // overflow bucket
   1056         bucketList.add(new Bucket<V>(getOverflowLabel(), scriptUpperBoundary, LabelType.OVERFLOW)); // final
   1057 
   1058         if (hasPinyin) {
   1059             // Redirect Pinyin buckets.
   1060             Bucket<V> asciiBucket = null;
   1061             for (int i = 0; i < 26; ++i) {
   1062                 if (asciiBuckets[i] != null) {
   1063                     asciiBucket = asciiBuckets[i];
   1064                 }
   1065                 if (pinyinBuckets[i] != null && asciiBucket != null) {
   1066                     pinyinBuckets[i].displayBucket = asciiBucket;
   1067                     hasInvisibleBuckets = true;
   1068                 }
   1069             }
   1070         }
   1071 
   1072         if (!hasInvisibleBuckets) {
   1073             return new BucketList<V>(bucketList, bucketList);
   1074         }
   1075         // Merge inflow buckets that are visually adjacent.
   1076         // Iterate backwards: Merge inflow into overflow rather than the other way around.
   1077         int i = bucketList.size() - 1;
   1078         Bucket<V> nextBucket = bucketList.get(i);
   1079         while (--i > 0) {
   1080             Bucket<V> bucket = bucketList.get(i);
   1081             if (bucket.displayBucket != null) {
   1082                 continue;  // skip invisible buckets
   1083             }
   1084             if (bucket.labelType == LabelType.INFLOW) {
   1085                 if (nextBucket.labelType != LabelType.NORMAL) {
   1086                     bucket.displayBucket = nextBucket;
   1087                     continue;
   1088                 }
   1089             }
   1090             nextBucket = bucket;
   1091         }
   1092 
   1093         ArrayList<Bucket<V>> publicBucketList = new ArrayList<Bucket<V>>();
   1094         for (Bucket<V> bucket : bucketList) {
   1095             if (bucket.displayBucket == null) {
   1096                 publicBucketList.add(bucket);
   1097             }
   1098         }
   1099         return new BucketList<V>(bucketList, publicBucketList);
   1100     }
   1101 
   1102     private static class BucketList<V> implements Iterable<Bucket<V>> {
   1103         private final ArrayList<Bucket<V>> bucketList;
   1104         private final List<Bucket<V>> immutableVisibleList;
   1105 
   1106         private BucketList(ArrayList<Bucket<V>> bucketList, ArrayList<Bucket<V>> publicBucketList) {
   1107             this.bucketList = bucketList;
   1108 
   1109             int displayIndex = 0;
   1110             for (Bucket<V> bucket : publicBucketList) {
   1111                 bucket.displayIndex = displayIndex++;
   1112             }
   1113             immutableVisibleList = Collections.unmodifiableList(publicBucketList);
   1114         }
   1115 
   1116         private int getBucketCount() {
   1117             return immutableVisibleList.size();
   1118         }
   1119 
   1120         private int getBucketIndex(CharSequence name, Collator collatorPrimaryOnly) {
   1121             // binary search
   1122             int start = 0;
   1123             int limit = bucketList.size();
   1124             while ((start + 1) < limit) {
   1125                 int i = (start + limit) / 2;
   1126                 Bucket<V> bucket = bucketList.get(i);
   1127                 int nameVsBucket = collatorPrimaryOnly.compare(name, bucket.lowerBoundary);
   1128                 if (nameVsBucket < 0) {
   1129                     limit = i;
   1130                 } else {
   1131                     start = i;
   1132                 }
   1133             }
   1134             Bucket<V> bucket = bucketList.get(start);
   1135             if (bucket.displayBucket != null) {
   1136                 bucket = bucket.displayBucket;
   1137             }
   1138             return bucket.displayIndex;
   1139         }
   1140 
   1141         /**
   1142          * Private iterator over all the buckets, visible and invisible
   1143          */
   1144         private Iterator<Bucket<V>> fullIterator() {
   1145             return bucketList.iterator();
   1146         }
   1147 
   1148         /**
   1149          * Iterator over just the visible buckets.
   1150          */
   1151         @Override
   1152         public Iterator<Bucket<V>> iterator() {
   1153             return immutableVisibleList.iterator(); // use immutable list to prevent remove().
   1154         }
   1155     }
   1156 
   1157     private static boolean hasMultiplePrimaryWeights(
   1158             RuleBasedCollator coll, long variableTop, String s) {
   1159         long[] ces = coll.internalGetCEs(s);
   1160         boolean seenPrimary = false;
   1161         for (int i = 0; i < ces.length; ++i) {
   1162             long ce = ces[i];
   1163             long p = ce >>> 32;
   1164             if (p > variableTop) {
   1165                 // not primary ignorable
   1166                 if (seenPrimary) {
   1167                     return true;
   1168                 }
   1169                 seenPrimary = true;
   1170             }
   1171         }
   1172         return false;
   1173     }
   1174 
   1175     // TODO: Surely we have at least a ticket for porting these mask values to UCharacter.java?!
   1176     private static final int GC_LU_MASK = 1 << UCharacter.UPPERCASE_LETTER;
   1177     private static final int GC_LL_MASK = 1 << UCharacter.LOWERCASE_LETTER;
   1178     private static final int GC_LT_MASK = 1 << UCharacter.TITLECASE_LETTER;
   1179     private static final int GC_LM_MASK = 1 << UCharacter.MODIFIER_LETTER;
   1180     private static final int GC_LO_MASK = 1 << UCharacter.OTHER_LETTER;
   1181     private static final int GC_L_MASK =
   1182             GC_LU_MASK|GC_LL_MASK|GC_LT_MASK|GC_LM_MASK|GC_LO_MASK;
   1183     private static final int GC_CN_MASK = 1 << UCharacter.GENERAL_OTHER_TYPES;
   1184 
   1185     /**
   1186      * Return a list of the first character in each script. Only exposed for testing.
   1187      *
   1188      * @return list of first characters in each script
   1189      * @deprecated This API is ICU internal, only for testing.
   1190      * @hide original deprecated declaration
   1191      * @hide draft / provisional / internal are hidden on Android
   1192      */
   1193     @Deprecated
   1194     public List<String> getFirstCharactersInScripts() {
   1195         List<String> dest = new ArrayList<String>(200);
   1196         // Fetch the script-first-primary contractions which are defined in the root collator.
   1197         // They all start with U+FDD1.
   1198         UnicodeSet set = new UnicodeSet();
   1199         collatorPrimaryOnly.internalAddContractions(0xFDD1, set);
   1200         if (set.isEmpty()) {
   1201             throw new UnsupportedOperationException(
   1202                     "AlphabeticIndex requires script-first-primary contractions");
   1203         }
   1204         for (String boundary : set) {
   1205             int gcMask = 1 << UCharacter.getType(boundary.codePointAt(1));
   1206             if ((gcMask & (GC_L_MASK | GC_CN_MASK)) == 0) {
   1207                 // Ignore boundaries for the special reordering groups.
   1208                 // Take only those for "real scripts" (where the sample character is a Letter,
   1209                 // and the one for unassigned implicit weights (Cn).
   1210                 continue;
   1211             }
   1212             dest.add(boundary);
   1213         }
   1214         return dest;
   1215     }
   1216 }
   1217