Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2013-2014, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * ContractionsAndExpansions.java, ported from collationsets.h/.cpp
     10 *
     11 * C++ version created on: 2013feb09
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import java.util.Iterator;
     18 
     19 import android.icu.impl.Trie2;
     20 import android.icu.text.UnicodeSet;
     21 import android.icu.util.CharsTrie;
     22 import android.icu.util.CharsTrie.Entry;
     23 
     24 /**
     25  * @hide Only a subset of ICU is exposed in Android
     26  */
     27 public final class ContractionsAndExpansions {
     28     // C++: The following fields are @internal, only public for access by callback.
     29     private CollationData data;
     30     private UnicodeSet contractions;
     31     private UnicodeSet expansions;
     32     private CESink sink;
     33     private boolean addPrefixes;
     34     private int checkTailored = 0;  // -1: collected tailored  +1: exclude tailored
     35     private UnicodeSet tailored = new UnicodeSet();
     36     private UnicodeSet ranges;
     37     private StringBuilder unreversedPrefix = new StringBuilder();
     38     private String suffix;
     39     private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
     40 
     41     public static interface CESink {
     42         void handleCE(long ce);
     43         void handleExpansion(long ces[], int start, int length);
     44     }
     45 
     46     public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
     47         contractions = con;
     48         expansions = exp;
     49         sink = s;
     50         addPrefixes = prefixes;
     51     }
     52 
     53     public void forData(CollationData d) {
     54         // Add all from the data, can be tailoring or base.
     55         if (d.base != null) {
     56             checkTailored = -1;
     57         }
     58         data = d;
     59         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
     60         Trie2.Range range;
     61         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     62             enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
     63         }
     64         if (d.base == null) {
     65             return;
     66         }
     67         // Add all from the base data but only for un-tailored code points.
     68         tailored.freeze();
     69         checkTailored = 1;
     70         data = d.base;
     71         trieIterator = data.trie.iterator();
     72         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     73             enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
     74         }
     75     }
     76 
     77     private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
     78         if (cne.checkTailored == 0) {
     79             // There is no tailoring.
     80             // No need to collect nor check the tailored set.
     81         } else if (cne.checkTailored < 0) {
     82             // Collect the set of code points with mappings in the tailoring data.
     83             if (ce32 == Collation.FALLBACK_CE32) {
     84                 return; // fallback to base, not tailored
     85             } else {
     86                 cne.tailored.add(start, end);
     87             }
     88             // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
     89         } else if (start == end) {
     90             if (cne.tailored.contains(start)) {
     91                 return;
     92             }
     93         } else if (cne.tailored.containsSome(start, end)) {
     94             if (cne.ranges == null) {
     95                 cne.ranges = new UnicodeSet();
     96             }
     97             cne.ranges.set(start, end).removeAll(cne.tailored);
     98             int count = cne.ranges.getRangeCount();
     99             for (int i = 0; i < count; ++i) {
    100                 cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
    101             }
    102         }
    103         cne.handleCE32(start, end, ce32);
    104     }
    105 
    106     public void forCodePoint(CollationData d, int c) {
    107         int ce32 = d.getCE32(c);
    108         if (ce32 == Collation.FALLBACK_CE32) {
    109             d = d.base;
    110             ce32 = d.getCE32(c);
    111         }
    112         data = d;
    113         handleCE32(c, c, ce32);
    114     }
    115 
    116     private void handleCE32(int start, int end, int ce32) {
    117         for (;;) {
    118             if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
    119                 // !isSpecialCE32()
    120                 if (sink != null) {
    121                     sink.handleCE(Collation.ceFromSimpleCE32(ce32));
    122                 }
    123                 return;
    124             }
    125             switch (Collation.tagFromCE32(ce32)) {
    126             case Collation.FALLBACK_TAG:
    127                 return;
    128             case Collation.RESERVED_TAG_3:
    129             case Collation.BUILDER_DATA_TAG:
    130             case Collation.LEAD_SURROGATE_TAG:
    131                 // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
    132                 throw new AssertionError(
    133                         String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
    134                                 Collation.tagFromCE32(ce32), ce32));
    135             case Collation.LONG_PRIMARY_TAG:
    136                 if (sink != null) {
    137                     sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
    138                 }
    139                 return;
    140             case Collation.LONG_SECONDARY_TAG:
    141                 if (sink != null) {
    142                     sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
    143                 }
    144                 return;
    145             case Collation.LATIN_EXPANSION_TAG:
    146                 if (sink != null) {
    147                     ces[0] = Collation.latinCE0FromCE32(ce32);
    148                     ces[1] = Collation.latinCE1FromCE32(ce32);
    149                     sink.handleExpansion(ces, 0, 2);
    150                 }
    151                 // Optimization: If we have a prefix,
    152                 // then the relevant strings have been added already.
    153                 if (unreversedPrefix.length() == 0) {
    154                     addExpansions(start, end);
    155                 }
    156                 return;
    157             case Collation.EXPANSION32_TAG:
    158                 if (sink != null) {
    159                     int idx = Collation.indexFromCE32(ce32);
    160                     int length = Collation.lengthFromCE32(ce32);
    161                     for (int i = 0; i < length; ++i) {
    162                         ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
    163                     }
    164                     sink.handleExpansion(ces, 0, length);
    165                 }
    166                 // Optimization: If we have a prefix,
    167                 // then the relevant strings have been added already.
    168                 if (unreversedPrefix.length() == 0) {
    169                     addExpansions(start, end);
    170                 }
    171                 return;
    172             case Collation.EXPANSION_TAG:
    173                 if (sink != null) {
    174                     int idx = Collation.indexFromCE32(ce32);
    175                     int length = Collation.lengthFromCE32(ce32);
    176                     sink.handleExpansion(data.ces, idx, length);
    177                 }
    178                 // Optimization: If we have a prefix,
    179                 // then the relevant strings have been added already.
    180                 if (unreversedPrefix.length() == 0) {
    181                     addExpansions(start, end);
    182                 }
    183                 return;
    184             case Collation.PREFIX_TAG:
    185                 handlePrefixes(start, end, ce32);
    186                 return;
    187             case Collation.CONTRACTION_TAG:
    188                 handleContractions(start, end, ce32);
    189                 return;
    190             case Collation.DIGIT_TAG:
    191                 // Fetch the non-numeric-collation CE32 and continue.
    192                 ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
    193                 break;
    194             case Collation.U0000_TAG:
    195                 assert (start == 0 && end == 0);
    196                 // Fetch the normal ce32 for U+0000 and continue.
    197                 ce32 = data.ce32s[0];
    198                 break;
    199             case Collation.HANGUL_TAG:
    200                 if (sink != null) {
    201                     // TODO: This should be optimized,
    202                     // especially if [start..end] is the complete Hangul range. (assert that)
    203                     UTF16CollationIterator iter = new UTF16CollationIterator(data);
    204                     StringBuilder hangul = new StringBuilder(1);
    205                     for (int c = start; c <= end; ++c) {
    206                         hangul.setLength(0);
    207                         hangul.appendCodePoint(c);
    208                         iter.setText(false, hangul, 0);
    209                         int length = iter.fetchCEs();
    210                         // Ignore the terminating non-CE.
    211                         assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
    212                         sink.handleExpansion(iter.getCEs(), 0, length - 1);
    213                     }
    214                 }
    215                 // Optimization: If we have a prefix,
    216                 // then the relevant strings have been added already.
    217                 if (unreversedPrefix.length() == 0) {
    218                     addExpansions(start, end);
    219                 }
    220                 return;
    221             case Collation.OFFSET_TAG:
    222                 // Currently no need to send offset CEs to the sink.
    223                 return;
    224             case Collation.IMPLICIT_TAG:
    225                 // Currently no need to send implicit CEs to the sink.
    226                 return;
    227             }
    228         }
    229     }
    230 
    231     private void handlePrefixes(int start, int end, int ce32) {
    232         int index = Collation.indexFromCE32(ce32);
    233         ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
    234         handleCE32(start, end, ce32);
    235         if (!addPrefixes) {
    236             return;
    237         }
    238         CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
    239         while (prefixes.hasNext()) {
    240             Entry e = prefixes.next();
    241             setPrefix(e.chars);
    242             // Prefix/pre-context mappings are special kinds of contractions
    243             // that always yield expansions.
    244             addStrings(start, end, contractions);
    245             addStrings(start, end, expansions);
    246             handleCE32(start, end, e.value);
    247         }
    248         resetPrefix();
    249     }
    250 
    251     void handleContractions(int start, int end, int ce32) {
    252         int index = Collation.indexFromCE32(ce32);
    253         if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    254             // No match on the single code point.
    255             // We are underneath a prefix, and the default mapping is just
    256             // a fallback to the mappings for a shorter prefix.
    257             assert (unreversedPrefix.length() != 0);
    258         } else {
    259             ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
    260             assert (!Collation.isContractionCE32(ce32));
    261             handleCE32(start, end, ce32);
    262         }
    263         CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
    264         while (suffixes.hasNext()) {
    265             Entry e = suffixes.next();
    266             suffix = e.chars.toString();
    267             addStrings(start, end, contractions);
    268             if (unreversedPrefix.length() != 0) {
    269                 addStrings(start, end, expansions);
    270             }
    271             handleCE32(start, end, e.value);
    272         }
    273         suffix = null;
    274     }
    275 
    276     void addExpansions(int start, int end) {
    277         if (unreversedPrefix.length() == 0 && suffix == null) {
    278             if (expansions != null) {
    279                 expansions.add(start, end);
    280             }
    281         } else {
    282             addStrings(start, end, expansions);
    283         }
    284     }
    285 
    286     void addStrings(int start, int end, UnicodeSet set) {
    287         if (set == null) {
    288             return;
    289         }
    290         StringBuilder s = new StringBuilder(unreversedPrefix);
    291         do {
    292             s.appendCodePoint(start);
    293             if (suffix != null) {
    294                 s.append(suffix);
    295             }
    296             set.add(s);
    297             s.setLength(unreversedPrefix.length());
    298         } while (++start <= end);
    299     }
    300 
    301     // Prefixes are reversed in the data structure.
    302     private void setPrefix(CharSequence pfx) {
    303         unreversedPrefix.setLength(0);
    304         unreversedPrefix.append(pfx).reverse();
    305     }
    306 
    307     private void resetPrefix() {
    308         unreversedPrefix.setLength(0);
    309     }
    310 }