Home | History | Annotate | Download | only in coll
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5 *******************************************************************************
      6 * Copyright (C) 2013-2014, International Business Machines
      7 * Corporation and others.  All Rights Reserved.
      8 *******************************************************************************
      9 * TailoredSet.java, ported from collationsets.h/.cpp
     10 *
     11 * C++ version created on: 2013feb09
     12 * created by: Markus W. Scherer
     13 */
     14 
     15 package android.icu.impl.coll;
     16 
     17 import java.util.Iterator;
     18 
     19 import android.icu.impl.Normalizer2Impl.Hangul;
     20 import android.icu.impl.Trie2;
     21 import android.icu.impl.Utility;
     22 import android.icu.text.UnicodeSet;
     23 import android.icu.util.CharsTrie;
     24 import android.icu.util.CharsTrie.Entry;
     25 
     26 /**
     27  * Finds the set of characters and strings that sort differently in the tailoring
     28  * from the base data.
     29  *
     30  * Every mapping in the tailoring needs to be compared to the base,
     31  * because some mappings are copied for optimization, and
     32  * all contractions for a character are copied if any contractions for that character
     33  * are added, modified or removed.
     34  *
     35  * It might be simpler to re-parse the rule string, but:
     36  * - That would require duplicating some of the from-rules builder code.
     37  * - That would make the runtime code depend on the builder.
     38  * - That would only work if we have the rule string, and we allow users to
     39  *   omit the rule string from data files.
     40  * @hide Only a subset of ICU is exposed in Android
     41  */
     42 public final class TailoredSet {
     43 
     44     private CollationData data;
     45     private CollationData baseData;
     46     private UnicodeSet tailored;
     47     private StringBuilder unreversedPrefix = new StringBuilder();
     48     private String suffix;
     49 
     50     public TailoredSet(UnicodeSet t) {
     51         tailored = t;
     52     }
     53 
     54     public void forData(CollationData d) {
     55         data = d;
     56         baseData = d.base;
     57         assert (baseData != null);
     58         // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
     59         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
     60         Trie2.Range range;
     61         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     62             enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
     63         }
     64     }
     65 
     66     private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
     67         if (ce32 == Collation.FALLBACK_CE32) {
     68             return; // fallback to base, not tailored
     69         }
     70         ts.handleCE32(start, end, ce32);
     71     }
     72 
     73     // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
     74     //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
     75     //  function (enumTailoredRange()). This is not necessary for Java implementation.
     76     private void handleCE32(int start, int end, int ce32) {
     77         assert (ce32 != Collation.FALLBACK_CE32);
     78         if (Collation.isSpecialCE32(ce32)) {
     79             ce32 = data.getIndirectCE32(ce32);
     80             if (ce32 == Collation.FALLBACK_CE32) {
     81                 return;
     82             }
     83         }
     84         do {
     85             int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
     86             // Do not just continue if ce32 == baseCE32 because
     87             // contractions and expansions in different data objects
     88             // normally differ even if they have the same data offsets.
     89             if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
     90                 // fastpath
     91                 if (ce32 != baseCE32) {
     92                     tailored.add(start);
     93                 }
     94             } else {
     95                 compare(start, ce32, baseCE32);
     96             }
     97         } while (++start <= end);
     98     }
     99 
    100     private void compare(int c, int ce32, int baseCE32) {
    101         if (Collation.isPrefixCE32(ce32)) {
    102             int dataIndex = Collation.indexFromCE32(ce32);
    103             ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
    104             if (Collation.isPrefixCE32(baseCE32)) {
    105                 int baseIndex = Collation.indexFromCE32(baseCE32);
    106                 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    107                 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
    108             } else {
    109                 addPrefixes(data, c, data.contexts, dataIndex + 2);
    110             }
    111         } else if (Collation.isPrefixCE32(baseCE32)) {
    112             int baseIndex = Collation.indexFromCE32(baseCE32);
    113             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    114             addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
    115         }
    116 
    117         if (Collation.isContractionCE32(ce32)) {
    118             int dataIndex = Collation.indexFromCE32(ce32);
    119             if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    120                 ce32 = Collation.NO_CE32;
    121             } else {
    122                 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
    123             }
    124             if (Collation.isContractionCE32(baseCE32)) {
    125                 int baseIndex = Collation.indexFromCE32(baseCE32);
    126                 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    127                     baseCE32 = Collation.NO_CE32;
    128                 } else {
    129                     baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    130                 }
    131                 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
    132             } else {
    133                 addContractions(c, data.contexts, dataIndex + 2);
    134             }
    135         } else if (Collation.isContractionCE32(baseCE32)) {
    136             int baseIndex = Collation.indexFromCE32(baseCE32);
    137             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    138             addContractions(c, baseData.contexts, baseIndex + 2);
    139         }
    140 
    141         int tag;
    142         if (Collation.isSpecialCE32(ce32)) {
    143             tag = Collation.tagFromCE32(ce32);
    144             assert (tag != Collation.PREFIX_TAG);
    145             assert (tag != Collation.CONTRACTION_TAG);
    146             // Currently, the tailoring data builder does not write offset tags.
    147             // They might be useful for saving space,
    148             // but they would complicate the builder,
    149             // and in tailorings we assume that performance of tailored characters is more important.
    150             assert (tag != Collation.OFFSET_TAG);
    151         } else {
    152             tag = -1;
    153         }
    154         int baseTag;
    155         if (Collation.isSpecialCE32(baseCE32)) {
    156             baseTag = Collation.tagFromCE32(baseCE32);
    157             assert (baseTag != Collation.PREFIX_TAG);
    158             assert (baseTag != Collation.CONTRACTION_TAG);
    159         } else {
    160             baseTag = -1;
    161         }
    162 
    163         // Non-contextual mappings, expansions, etc.
    164         if (baseTag == Collation.OFFSET_TAG) {
    165             // We might be comparing a tailoring CE which is a copy of
    166             // a base offset-tag CE, via the [optimize [set]] syntax
    167             // or when a single-character mapping was copied for tailored contractions.
    168             // Offset tags always result in long-primary CEs,
    169             // with common secondary/tertiary weights.
    170             if (!Collation.isLongPrimaryCE32(ce32)) {
    171                 add(c);
    172                 return;
    173             }
    174             long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
    175             long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
    176             if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
    177                 add(c);
    178                 return;
    179             }
    180         }
    181 
    182         if (tag != baseTag) {
    183             add(c);
    184             return;
    185         }
    186 
    187         if (tag == Collation.EXPANSION32_TAG) {
    188             int length = Collation.lengthFromCE32(ce32);
    189             int baseLength = Collation.lengthFromCE32(baseCE32);
    190 
    191             if (length != baseLength) {
    192                 add(c);
    193                 return;
    194             }
    195 
    196             int idx0 = Collation.indexFromCE32(ce32);
    197             int idx1 = Collation.indexFromCE32(baseCE32);
    198 
    199             for (int i = 0; i < length; ++i) {
    200                 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
    201                     add(c);
    202                     break;
    203                 }
    204             }
    205         } else if (tag == Collation.EXPANSION_TAG) {
    206             int length = Collation.lengthFromCE32(ce32);
    207             int baseLength = Collation.lengthFromCE32(baseCE32);
    208 
    209             if (length != baseLength) {
    210                 add(c);
    211                 return;
    212             }
    213 
    214             int idx0 = Collation.indexFromCE32(ce32);
    215             int idx1 = Collation.indexFromCE32(baseCE32);
    216 
    217             for (int i = 0; i < length; ++i) {
    218                 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
    219                     add(c);
    220                     break;
    221                 }
    222             }
    223         } else if (tag == Collation.HANGUL_TAG) {
    224             StringBuilder jamos = new StringBuilder();
    225             int length = Hangul.decompose(c, jamos);
    226             if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
    227                     || (length == 3 && tailored.contains(jamos.charAt(2)))) {
    228                 add(c);
    229             }
    230         } else if (ce32 != baseCE32) {
    231             add(c);
    232         }
    233     }
    234 
    235     private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
    236         // Parallel iteration over prefixes of both tables.
    237         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
    238         CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
    239         String tp = null; // Tailoring prefix.
    240         String bp = null; // Base prefix.
    241         // Use a string with a U+FFFF as the limit sentinel.
    242         // U+FFFF is untailorable and will not occur in prefixes.
    243         String none = "\uffff";
    244         Entry te = null, be = null;
    245         for (;;) {
    246             if (tp == null) {
    247                 if (prefixes.hasNext()) {
    248                     te = prefixes.next();
    249                     tp = te.chars.toString();
    250                 } else {
    251                     te = null;
    252                     tp = none;
    253                 }
    254             }
    255             if (bp == null) {
    256                 if (basePrefixes.hasNext()) {
    257                     be = basePrefixes.next();
    258                     bp = be.chars.toString();
    259                 } else {
    260                     be = null;
    261                     bp = none;
    262                 }
    263             }
    264             if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) {
    265                 break;
    266             }
    267             int cmp = tp.compareTo(bp);
    268             if (cmp < 0) {
    269                 // tp occurs in the tailoring but not in the base.
    270                 assert (te != null);
    271                 addPrefix(data, tp, c, te.value);
    272                 te = null;
    273                 tp = null;
    274             } else if (cmp > 0) {
    275                 // bp occurs in the base but not in the tailoring.
    276                 assert (be != null);
    277                 addPrefix(baseData, bp, c, be.value);
    278                 be = null;
    279                 bp = null;
    280             } else {
    281                 setPrefix(tp);
    282                 assert (te != null && be != null);
    283                 compare(c, te.value, be.value);
    284                 resetPrefix();
    285                 te = be = null;
    286                 tp = bp = null;
    287             }
    288         }
    289     }
    290 
    291     private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
    292         // Parallel iteration over suffixes of both tables.
    293         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
    294         CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
    295         String ts = null; // Tailoring suffix.
    296         String bs = null; // Base suffix.
    297         // Use a string with two U+FFFF as the limit sentinel.
    298         // U+FFFF is untailorable and will not occur in contractions except maybe
    299         // as a single suffix character for a root-collator boundary contraction.
    300         String none = "\uffff\uffff";
    301         Entry te = null, be = null;
    302         for (;;) {
    303             if (ts == null) {
    304                 if (suffixes.hasNext()) {
    305                     te = suffixes.next();
    306                     ts = te.chars.toString();
    307                 } else {
    308                     te = null;
    309                     ts = none;
    310                 }
    311             }
    312             if (bs == null) {
    313                 if (baseSuffixes.hasNext()) {
    314                     be = baseSuffixes.next();
    315                     bs = be.chars.toString();
    316                 } else {
    317                     be = null;
    318                     bs = none;
    319                 }
    320             }
    321             if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) {
    322                 break;
    323             }
    324             int cmp = ts.compareTo(bs);
    325             if (cmp < 0) {
    326                 // ts occurs in the tailoring but not in the base.
    327                 addSuffix(c, ts);
    328                 te = null;
    329                 ts = null;
    330             } else if (cmp > 0) {
    331                 // bs occurs in the base but not in the tailoring.
    332                 addSuffix(c, bs);
    333                 be = null;
    334                 bs = null;
    335             } else {
    336                 suffix = ts;
    337                 compare(c, te.value, be.value);
    338                 suffix = null;
    339                 te = be = null;
    340                 ts = bs = null;
    341             }
    342         }
    343     }
    344 
    345     private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
    346         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
    347         while (prefixes.hasNext()) {
    348             Entry e = prefixes.next();
    349             addPrefix(d, e.chars, c, e.value);
    350         }
    351     }
    352 
    353     private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
    354         setPrefix(pfx);
    355         ce32 = d.getFinalCE32(ce32);
    356         if (Collation.isContractionCE32(ce32)) {
    357             int idx = Collation.indexFromCE32(ce32);
    358             addContractions(c, d.contexts, idx + 2);
    359         }
    360         tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
    361         resetPrefix();
    362     }
    363 
    364     private void addContractions(int c, CharSequence p, int pidx) {
    365         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
    366         while (suffixes.hasNext()) {
    367             Entry e = suffixes.next();
    368             addSuffix(c, e.chars);
    369         }
    370     }
    371 
    372     private void addSuffix(int c, CharSequence sfx) {
    373         tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
    374     }
    375 
    376     private void add(int c) {
    377         if (unreversedPrefix.length() == 0 && suffix == null) {
    378             tailored.add(c);
    379         } else {
    380             StringBuilder s = new StringBuilder(unreversedPrefix);
    381             s.appendCodePoint(c);
    382             if (suffix != null) {
    383                 s.append(suffix);
    384             }
    385             tailored.add(s);
    386         }
    387     }
    388 
    389     // Prefixes are reversed in the data structure.
    390     private void setPrefix(CharSequence pfx) {
    391         unreversedPrefix.setLength(0);
    392         unreversedPrefix.append(pfx).reverse();
    393     }
    394 
    395     private void resetPrefix() {
    396         unreversedPrefix.setLength(0);
    397     }
    398 }
    399 
    400