Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * ContractionsAndExpansions.java, ported from collationsets.h/.cpp
      9 *
     10 * C++ version created on: 2013feb09
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import java.util.Iterator;
     17 
     18 import com.ibm.icu.impl.Trie2;
     19 import com.ibm.icu.text.UnicodeSet;
     20 import com.ibm.icu.util.CharsTrie;
     21 import com.ibm.icu.util.CharsTrie.Entry;
     22 
     23 public final class ContractionsAndExpansions {
     24     // C++: The following fields are @internal, only public for access by callback.
     25     private CollationData data;
     26     private UnicodeSet contractions;
     27     private UnicodeSet expansions;
     28     private CESink sink;
     29     private boolean addPrefixes;
     30     private int checkTailored = 0;  // -1: collected tailored  +1: exclude tailored
     31     private UnicodeSet tailored = new UnicodeSet();
     32     private UnicodeSet ranges;
     33     private StringBuilder unreversedPrefix = new StringBuilder();
     34     private String suffix;
     35     private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH];
     36 
     37     public static interface CESink {
     38         void handleCE(long ce);
     39         void handleExpansion(long ces[], int start, int length);
     40     }
     41 
     42     public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) {
     43         contractions = con;
     44         expansions = exp;
     45         sink = s;
     46         addPrefixes = prefixes;
     47     }
     48 
     49     public void forData(CollationData d) {
     50         // Add all from the data, can be tailoring or base.
     51         if (d.base != null) {
     52             checkTailored = -1;
     53         }
     54         data = d;
     55         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
     56         Trie2.Range range;
     57         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     58             enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
     59         }
     60         if (d.base == null) {
     61             return;
     62         }
     63         // Add all from the base data but only for un-tailored code points.
     64         tailored.freeze();
     65         checkTailored = 1;
     66         data = d.base;
     67         trieIterator = data.trie.iterator();
     68         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     69             enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this);
     70         }
     71     }
     72 
     73     private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) {
     74         if (cne.checkTailored == 0) {
     75             // There is no tailoring.
     76             // No need to collect nor check the tailored set.
     77         } else if (cne.checkTailored < 0) {
     78             // Collect the set of code points with mappings in the tailoring data.
     79             if (ce32 == Collation.FALLBACK_CE32) {
     80                 return; // fallback to base, not tailored
     81             } else {
     82                 cne.tailored.add(start, end);
     83             }
     84             // checkTailored > 0: Exclude tailored ranges from the base data enumeration.
     85         } else if (start == end) {
     86             if (cne.tailored.contains(start)) {
     87                 return;
     88             }
     89         } else if (cne.tailored.containsSome(start, end)) {
     90             if (cne.ranges == null) {
     91                 cne.ranges = new UnicodeSet();
     92             }
     93             cne.ranges.set(start, end).removeAll(cne.tailored);
     94             int count = cne.ranges.getRangeCount();
     95             for (int i = 0; i < count; ++i) {
     96                 cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32);
     97             }
     98         }
     99         cne.handleCE32(start, end, ce32);
    100     }
    101 
    102     public void forCodePoint(CollationData d, int c) {
    103         int ce32 = d.getCE32(c);
    104         if (ce32 == Collation.FALLBACK_CE32) {
    105             d = d.base;
    106             ce32 = d.getCE32(c);
    107         }
    108         data = d;
    109         handleCE32(c, c, ce32);
    110     }
    111 
    112     private void handleCE32(int start, int end, int ce32) {
    113         for (;;) {
    114             if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) {
    115                 // !isSpecialCE32()
    116                 if (sink != null) {
    117                     sink.handleCE(Collation.ceFromSimpleCE32(ce32));
    118                 }
    119                 return;
    120             }
    121             switch (Collation.tagFromCE32(ce32)) {
    122             case Collation.FALLBACK_TAG:
    123                 return;
    124             case Collation.RESERVED_TAG_3:
    125             case Collation.BUILDER_DATA_TAG:
    126             case Collation.LEAD_SURROGATE_TAG:
    127                 // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C.
    128                 throw new AssertionError(
    129                         String.format("Unexpected CE32 tag type %d for ce32=0x%08x",
    130                                 Collation.tagFromCE32(ce32), ce32));
    131             case Collation.LONG_PRIMARY_TAG:
    132                 if (sink != null) {
    133                     sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32));
    134                 }
    135                 return;
    136             case Collation.LONG_SECONDARY_TAG:
    137                 if (sink != null) {
    138                     sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32));
    139                 }
    140                 return;
    141             case Collation.LATIN_EXPANSION_TAG:
    142                 if (sink != null) {
    143                     ces[0] = Collation.latinCE0FromCE32(ce32);
    144                     ces[1] = Collation.latinCE1FromCE32(ce32);
    145                     sink.handleExpansion(ces, 0, 2);
    146                 }
    147                 // Optimization: If we have a prefix,
    148                 // then the relevant strings have been added already.
    149                 if (unreversedPrefix.length() == 0) {
    150                     addExpansions(start, end);
    151                 }
    152                 return;
    153             case Collation.EXPANSION32_TAG:
    154                 if (sink != null) {
    155                     int idx = Collation.indexFromCE32(ce32);
    156                     int length = Collation.lengthFromCE32(ce32);
    157                     for (int i = 0; i < length; ++i) {
    158                         ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]);
    159                     }
    160                     sink.handleExpansion(ces, 0, length);
    161                 }
    162                 // Optimization: If we have a prefix,
    163                 // then the relevant strings have been added already.
    164                 if (unreversedPrefix.length() == 0) {
    165                     addExpansions(start, end);
    166                 }
    167                 return;
    168             case Collation.EXPANSION_TAG:
    169                 if (sink != null) {
    170                     int idx = Collation.indexFromCE32(ce32);
    171                     int length = Collation.lengthFromCE32(ce32);
    172                     sink.handleExpansion(data.ces, idx, length);
    173                 }
    174                 // Optimization: If we have a prefix,
    175                 // then the relevant strings have been added already.
    176                 if (unreversedPrefix.length() == 0) {
    177                     addExpansions(start, end);
    178                 }
    179                 return;
    180             case Collation.PREFIX_TAG:
    181                 handlePrefixes(start, end, ce32);
    182                 return;
    183             case Collation.CONTRACTION_TAG:
    184                 handleContractions(start, end, ce32);
    185                 return;
    186             case Collation.DIGIT_TAG:
    187                 // Fetch the non-numeric-collation CE32 and continue.
    188                 ce32 = data.ce32s[Collation.indexFromCE32(ce32)];
    189                 break;
    190             case Collation.U0000_TAG:
    191                 assert (start == 0 && end == 0);
    192                 // Fetch the normal ce32 for U+0000 and continue.
    193                 ce32 = data.ce32s[0];
    194                 break;
    195             case Collation.HANGUL_TAG:
    196                 if (sink != null) {
    197                     // TODO: This should be optimized,
    198                     // especially if [start..end] is the complete Hangul range. (assert that)
    199                     UTF16CollationIterator iter = new UTF16CollationIterator(data);
    200                     StringBuilder hangul = new StringBuilder(1);
    201                     for (int c = start; c <= end; ++c) {
    202                         hangul.setLength(0);
    203                         hangul.appendCodePoint(c);
    204                         iter.setText(false, hangul, 0);
    205                         int length = iter.fetchCEs();
    206                         // Ignore the terminating non-CE.
    207                         assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE);
    208                         sink.handleExpansion(iter.getCEs(), 0, length - 1);
    209                     }
    210                 }
    211                 // Optimization: If we have a prefix,
    212                 // then the relevant strings have been added already.
    213                 if (unreversedPrefix.length() == 0) {
    214                     addExpansions(start, end);
    215                 }
    216                 return;
    217             case Collation.OFFSET_TAG:
    218                 // Currently no need to send offset CEs to the sink.
    219                 return;
    220             case Collation.IMPLICIT_TAG:
    221                 // Currently no need to send implicit CEs to the sink.
    222                 return;
    223             }
    224         }
    225     }
    226 
    227     private void handlePrefixes(int start, int end, int ce32) {
    228         int index = Collation.indexFromCE32(ce32);
    229         ce32 = data.getCE32FromContexts(index); // Default if no prefix match.
    230         handleCE32(start, end, ce32);
    231         if (!addPrefixes) {
    232             return;
    233         }
    234         CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator();
    235         while (prefixes.hasNext()) {
    236             Entry e = prefixes.next();
    237             setPrefix(e.chars);
    238             // Prefix/pre-context mappings are special kinds of contractions
    239             // that always yield expansions.
    240             addStrings(start, end, contractions);
    241             addStrings(start, end, expansions);
    242             handleCE32(start, end, e.value);
    243         }
    244         resetPrefix();
    245     }
    246 
    247     void handleContractions(int start, int end, int ce32) {
    248         int index = Collation.indexFromCE32(ce32);
    249         if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    250             // No match on the single code point.
    251             // We are underneath a prefix, and the default mapping is just
    252             // a fallback to the mappings for a shorter prefix.
    253             assert (unreversedPrefix.length() != 0);
    254         } else {
    255             ce32 = data.getCE32FromContexts(index); // Default if no suffix match.
    256             assert (!Collation.isContractionCE32(ce32));
    257             handleCE32(start, end, ce32);
    258         }
    259         CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator();
    260         while (suffixes.hasNext()) {
    261             Entry e = suffixes.next();
    262             suffix = e.chars.toString();
    263             addStrings(start, end, contractions);
    264             if (unreversedPrefix.length() != 0) {
    265                 addStrings(start, end, expansions);
    266             }
    267             handleCE32(start, end, e.value);
    268         }
    269         suffix = null;
    270     }
    271 
    272     void addExpansions(int start, int end) {
    273         if (unreversedPrefix.length() == 0 && suffix == null) {
    274             if (expansions != null) {
    275                 expansions.add(start, end);
    276             }
    277         } else {
    278             addStrings(start, end, expansions);
    279         }
    280     }
    281 
    282     void addStrings(int start, int end, UnicodeSet set) {
    283         if (set == null) {
    284             return;
    285         }
    286         StringBuilder s = new StringBuilder(unreversedPrefix);
    287         do {
    288             s.appendCodePoint(start);
    289             if (suffix != null) {
    290                 s.append(suffix);
    291             }
    292             set.add(s);
    293             s.setLength(unreversedPrefix.length());
    294         } while (++start <= end);
    295     }
    296 
    297     // Prefixes are reversed in the data structure.
    298     private void setPrefix(CharSequence pfx) {
    299         unreversedPrefix.setLength(0);
    300         unreversedPrefix.append(pfx).reverse();
    301     }
    302 
    303     private void resetPrefix() {
    304         unreversedPrefix.setLength(0);
    305     }
    306 }