Home | History | Annotate | Download | only in coll
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4 *******************************************************************************
      5 * Copyright (C) 2013-2014, International Business Machines
      6 * Corporation and others.  All Rights Reserved.
      7 *******************************************************************************
      8 * TailoredSet.java, ported from collationsets.h/.cpp
      9 *
     10 * C++ version created on: 2013feb09
     11 * created by: Markus W. Scherer
     12 */
     13 
     14 package com.ibm.icu.impl.coll;
     15 
     16 import java.util.Iterator;
     17 
     18 import com.ibm.icu.impl.Normalizer2Impl.Hangul;
     19 import com.ibm.icu.impl.Trie2;
     20 import com.ibm.icu.impl.Utility;
     21 import com.ibm.icu.text.UnicodeSet;
     22 import com.ibm.icu.util.CharsTrie;
     23 import com.ibm.icu.util.CharsTrie.Entry;
     24 
     25 /**
     26  * Finds the set of characters and strings that sort differently in the tailoring
     27  * from the base data.
     28  *
     29  * Every mapping in the tailoring needs to be compared to the base,
     30  * because some mappings are copied for optimization, and
     31  * all contractions for a character are copied if any contractions for that character
     32  * are added, modified or removed.
     33  *
     34  * It might be simpler to re-parse the rule string, but:
     35  * - That would require duplicating some of the from-rules builder code.
     36  * - That would make the runtime code depend on the builder.
     37  * - That would only work if we have the rule string, and we allow users to
     38  *   omit the rule string from data files.
     39  */
     40 public final class TailoredSet {
     41 
     42     private CollationData data;
     43     private CollationData baseData;
     44     private UnicodeSet tailored;
     45     private StringBuilder unreversedPrefix = new StringBuilder();
     46     private String suffix;
     47 
     48     public TailoredSet(UnicodeSet t) {
     49         tailored = t;
     50     }
     51 
     52     public void forData(CollationData d) {
     53         data = d;
     54         baseData = d.base;
     55         assert (baseData != null);
     56         // utrie2_enum(data->trie, NULL, enumTailoredRange, this);
     57         Iterator<Trie2.Range> trieIterator = data.trie.iterator();
     58         Trie2.Range range;
     59         while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) {
     60             enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this);
     61         }
     62     }
     63 
     64     private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) {
     65         if (ce32 == Collation.FALLBACK_CE32) {
     66             return; // fallback to base, not tailored
     67         }
     68         ts.handleCE32(start, end, ce32);
     69     }
     70 
     71     // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J.
     72     //  Also, ICU4C requires handleCE32() to be public because it is used by the callback
     73     //  function (enumTailoredRange()). This is not necessary for Java implementation.
     74     private void handleCE32(int start, int end, int ce32) {
     75         assert (ce32 != Collation.FALLBACK_CE32);
     76         if (Collation.isSpecialCE32(ce32)) {
     77             ce32 = data.getIndirectCE32(ce32);
     78             if (ce32 == Collation.FALLBACK_CE32) {
     79                 return;
     80             }
     81         }
     82         do {
     83             int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start));
     84             // Do not just continue if ce32 == baseCE32 because
     85             // contractions and expansions in different data objects
     86             // normally differ even if they have the same data offsets.
     87             if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) {
     88                 // fastpath
     89                 if (ce32 != baseCE32) {
     90                     tailored.add(start);
     91                 }
     92             } else {
     93                 compare(start, ce32, baseCE32);
     94             }
     95         } while (++start <= end);
     96     }
     97 
     98     private void compare(int c, int ce32, int baseCE32) {
     99         if (Collation.isPrefixCE32(ce32)) {
    100             int dataIndex = Collation.indexFromCE32(ce32);
    101             ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
    102             if (Collation.isPrefixCE32(baseCE32)) {
    103                 int baseIndex = Collation.indexFromCE32(baseCE32);
    104                 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    105                 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
    106             } else {
    107                 addPrefixes(data, c, data.contexts, dataIndex + 2);
    108             }
    109         } else if (Collation.isPrefixCE32(baseCE32)) {
    110             int baseIndex = Collation.indexFromCE32(baseCE32);
    111             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    112             addPrefixes(baseData, c, baseData.contexts, baseIndex + 2);
    113         }
    114 
    115         if (Collation.isContractionCE32(ce32)) {
    116             int dataIndex = Collation.indexFromCE32(ce32);
    117             if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    118                 ce32 = Collation.NO_CE32;
    119             } else {
    120                 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex));
    121             }
    122             if (Collation.isContractionCE32(baseCE32)) {
    123                 int baseIndex = Collation.indexFromCE32(baseCE32);
    124                 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
    125                     baseCE32 = Collation.NO_CE32;
    126                 } else {
    127                     baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    128                 }
    129                 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2);
    130             } else {
    131                 addContractions(c, data.contexts, dataIndex + 2);
    132             }
    133         } else if (Collation.isContractionCE32(baseCE32)) {
    134             int baseIndex = Collation.indexFromCE32(baseCE32);
    135             baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex));
    136             addContractions(c, baseData.contexts, baseIndex + 2);
    137         }
    138 
    139         int tag;
    140         if (Collation.isSpecialCE32(ce32)) {
    141             tag = Collation.tagFromCE32(ce32);
    142             assert (tag != Collation.PREFIX_TAG);
    143             assert (tag != Collation.CONTRACTION_TAG);
    144             // Currently, the tailoring data builder does not write offset tags.
    145             // They might be useful for saving space,
    146             // but they would complicate the builder,
    147             // and in tailorings we assume that performance of tailored characters is more important.
    148             assert (tag != Collation.OFFSET_TAG);
    149         } else {
    150             tag = -1;
    151         }
    152         int baseTag;
    153         if (Collation.isSpecialCE32(baseCE32)) {
    154             baseTag = Collation.tagFromCE32(baseCE32);
    155             assert (baseTag != Collation.PREFIX_TAG);
    156             assert (baseTag != Collation.CONTRACTION_TAG);
    157         } else {
    158             baseTag = -1;
    159         }
    160 
    161         // Non-contextual mappings, expansions, etc.
    162         if (baseTag == Collation.OFFSET_TAG) {
    163             // We might be comparing a tailoring CE which is a copy of
    164             // a base offset-tag CE, via the [optimize [set]] syntax
    165             // or when a single-character mapping was copied for tailored contractions.
    166             // Offset tags always result in long-primary CEs,
    167             // with common secondary/tertiary weights.
    168             if (!Collation.isLongPrimaryCE32(ce32)) {
    169                 add(c);
    170                 return;
    171             }
    172             long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)];
    173             long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE);
    174             if (Collation.primaryFromLongPrimaryCE32(ce32) != p) {
    175                 add(c);
    176                 return;
    177             }
    178         }
    179 
    180         if (tag != baseTag) {
    181             add(c);
    182             return;
    183         }
    184 
    185         if (tag == Collation.EXPANSION32_TAG) {
    186             int length = Collation.lengthFromCE32(ce32);
    187             int baseLength = Collation.lengthFromCE32(baseCE32);
    188 
    189             if (length != baseLength) {
    190                 add(c);
    191                 return;
    192             }
    193 
    194             int idx0 = Collation.indexFromCE32(ce32);
    195             int idx1 = Collation.indexFromCE32(baseCE32);
    196 
    197             for (int i = 0; i < length; ++i) {
    198                 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) {
    199                     add(c);
    200                     break;
    201                 }
    202             }
    203         } else if (tag == Collation.EXPANSION_TAG) {
    204             int length = Collation.lengthFromCE32(ce32);
    205             int baseLength = Collation.lengthFromCE32(baseCE32);
    206 
    207             if (length != baseLength) {
    208                 add(c);
    209                 return;
    210             }
    211 
    212             int idx0 = Collation.indexFromCE32(ce32);
    213             int idx1 = Collation.indexFromCE32(baseCE32);
    214 
    215             for (int i = 0; i < length; ++i) {
    216                 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) {
    217                     add(c);
    218                     break;
    219                 }
    220             }
    221         } else if (tag == Collation.HANGUL_TAG) {
    222             StringBuilder jamos = new StringBuilder();
    223             int length = Hangul.decompose(c, jamos);
    224             if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1))
    225                     || (length == 3 && tailored.contains(jamos.charAt(2)))) {
    226                 add(c);
    227             }
    228         } else if (ce32 != baseCE32) {
    229             add(c);
    230         }
    231     }
    232 
    233     private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
    234         // Parallel iteration over prefixes of both tables.
    235         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
    236         CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator();
    237         String tp = null; // Tailoring prefix.
    238         String bp = null; // Base prefix.
    239         // Use a string with a U+FFFF as the limit sentinel.
    240         // U+FFFF is untailorable and will not occur in prefixes.
    241         String none = "\uffff";
    242         Entry te = null, be = null;
    243         for (;;) {
    244             if (tp == null) {
    245                 if (prefixes.hasNext()) {
    246                     te = prefixes.next();
    247                     tp = te.chars.toString();
    248                 } else {
    249                     te = null;
    250                     tp = none;
    251                 }
    252             }
    253             if (bp == null) {
    254                 if (basePrefixes.hasNext()) {
    255                     be = basePrefixes.next();
    256                     bp = be.chars.toString();
    257                 } else {
    258                     be = null;
    259                     bp = none;
    260                 }
    261             }
    262             if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) {
    263                 break;
    264             }
    265             int cmp = tp.compareTo(bp);
    266             if (cmp < 0) {
    267                 // tp occurs in the tailoring but not in the base.
    268                 assert (te != null);
    269                 addPrefix(data, tp, c, te.value);
    270                 te = null;
    271                 tp = null;
    272             } else if (cmp > 0) {
    273                 // bp occurs in the base but not in the tailoring.
    274                 assert (be != null);
    275                 addPrefix(baseData, bp, c, be.value);
    276                 be = null;
    277                 bp = null;
    278             } else {
    279                 setPrefix(tp);
    280                 assert (te != null && be != null);
    281                 compare(c, te.value, be.value);
    282                 resetPrefix();
    283                 te = be = null;
    284                 tp = bp = null;
    285             }
    286         }
    287     }
    288 
    289     private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) {
    290         // Parallel iteration over suffixes of both tables.
    291         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
    292         CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator();
    293         String ts = null; // Tailoring suffix.
    294         String bs = null; // Base suffix.
    295         // Use a string with two U+FFFF as the limit sentinel.
    296         // U+FFFF is untailorable and will not occur in contractions except maybe
    297         // as a single suffix character for a root-collator boundary contraction.
    298         String none = "\uffff\uffff";
    299         Entry te = null, be = null;
    300         for (;;) {
    301             if (ts == null) {
    302                 if (suffixes.hasNext()) {
    303                     te = suffixes.next();
    304                     ts = te.chars.toString();
    305                 } else {
    306                     te = null;
    307                     ts = none;
    308                 }
    309             }
    310             if (bs == null) {
    311                 if (baseSuffixes.hasNext()) {
    312                     be = baseSuffixes.next();
    313                     bs = be.chars.toString();
    314                 } else {
    315                     be = null;
    316                     bs = none;
    317                 }
    318             }
    319             if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) {
    320                 break;
    321             }
    322             int cmp = ts.compareTo(bs);
    323             if (cmp < 0) {
    324                 // ts occurs in the tailoring but not in the base.
    325                 addSuffix(c, ts);
    326                 te = null;
    327                 ts = null;
    328             } else if (cmp > 0) {
    329                 // bs occurs in the base but not in the tailoring.
    330                 addSuffix(c, bs);
    331                 be = null;
    332                 bs = null;
    333             } else {
    334                 suffix = ts;
    335                 compare(c, te.value, be.value);
    336                 suffix = null;
    337                 te = be = null;
    338                 ts = bs = null;
    339             }
    340         }
    341     }
    342 
    343     private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) {
    344         CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator();
    345         while (prefixes.hasNext()) {
    346             Entry e = prefixes.next();
    347             addPrefix(d, e.chars, c, e.value);
    348         }
    349     }
    350 
    351     private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) {
    352         setPrefix(pfx);
    353         ce32 = d.getFinalCE32(ce32);
    354         if (Collation.isContractionCE32(ce32)) {
    355             int idx = Collation.indexFromCE32(ce32);
    356             addContractions(c, d.contexts, idx + 2);
    357         }
    358         tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c)));
    359         resetPrefix();
    360     }
    361 
    362     private void addContractions(int c, CharSequence p, int pidx) {
    363         CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator();
    364         while (suffixes.hasNext()) {
    365             Entry e = suffixes.next();
    366             addSuffix(c, e.chars);
    367         }
    368     }
    369 
    370     private void addSuffix(int c, CharSequence sfx) {
    371         tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx));
    372     }
    373 
    374     private void add(int c) {
    375         if (unreversedPrefix.length() == 0 && suffix == null) {
    376             tailored.add(c);
    377         } else {
    378             StringBuilder s = new StringBuilder(unreversedPrefix);
    379             s.appendCodePoint(c);
    380             if (suffix != null) {
    381                 s.append(suffix);
    382             }
    383             tailored.add(s);
    384         }
    385     }
    386 
    387     // Prefixes are reversed in the data structure.
    388     private void setPrefix(CharSequence pfx) {
    389         unreversedPrefix.setLength(0);
    390         unreversedPrefix.append(pfx).reverse();
    391     }
    392 
    393     private void resetPrefix() {
    394         unreversedPrefix.setLength(0);
    395     }
    396 }
    397 
    398