1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2014, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * ContractionsAndExpansions.java, ported from collationsets.h/.cpp 9 * 10 * C++ version created on: 2013feb09 11 * created by: Markus W. Scherer 12 */ 13 14 package com.ibm.icu.impl.coll; 15 16 import java.util.Iterator; 17 18 import com.ibm.icu.impl.Trie2; 19 import com.ibm.icu.text.UnicodeSet; 20 import com.ibm.icu.util.CharsTrie; 21 import com.ibm.icu.util.CharsTrie.Entry; 22 23 public final class ContractionsAndExpansions { 24 // C++: The following fields are @internal, only public for access by callback. 25 private CollationData data; 26 private UnicodeSet contractions; 27 private UnicodeSet expansions; 28 private CESink sink; 29 private boolean addPrefixes; 30 private int checkTailored = 0; // -1: collected tailored +1: exclude tailored 31 private UnicodeSet tailored = new UnicodeSet(); 32 private UnicodeSet ranges; 33 private StringBuilder unreversedPrefix = new StringBuilder(); 34 private String suffix; 35 private long[] ces = new long[Collation.MAX_EXPANSION_LENGTH]; 36 37 public static interface CESink { 38 void handleCE(long ce); 39 void handleExpansion(long ces[], int start, int length); 40 } 41 42 public ContractionsAndExpansions(UnicodeSet con, UnicodeSet exp, CESink s, boolean prefixes) { 43 contractions = con; 44 expansions = exp; 45 sink = s; 46 addPrefixes = prefixes; 47 } 48 49 public void forData(CollationData d) { 50 // Add all from the data, can be tailoring or base. 51 if (d.base != null) { 52 checkTailored = -1; 53 } 54 data = d; 55 Iterator<Trie2.Range> trieIterator = data.trie.iterator(); 56 Trie2.Range range; 57 while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 58 enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this); 59 } 60 if (d.base == null) { 61 return; 62 } 63 // Add all from the base data but only for un-tailored code points. 64 tailored.freeze(); 65 checkTailored = 1; 66 data = d.base; 67 trieIterator = data.trie.iterator(); 68 while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 69 enumCnERange(range.startCodePoint, range.endCodePoint, range.value, this); 70 } 71 } 72 73 private void enumCnERange(int start, int end, int ce32, ContractionsAndExpansions cne) { 74 if (cne.checkTailored == 0) { 75 // There is no tailoring. 76 // No need to collect nor check the tailored set. 77 } else if (cne.checkTailored < 0) { 78 // Collect the set of code points with mappings in the tailoring data. 79 if (ce32 == Collation.FALLBACK_CE32) { 80 return; // fallback to base, not tailored 81 } else { 82 cne.tailored.add(start, end); 83 } 84 // checkTailored > 0: Exclude tailored ranges from the base data enumeration. 85 } else if (start == end) { 86 if (cne.tailored.contains(start)) { 87 return; 88 } 89 } else if (cne.tailored.containsSome(start, end)) { 90 if (cne.ranges == null) { 91 cne.ranges = new UnicodeSet(); 92 } 93 cne.ranges.set(start, end).removeAll(cne.tailored); 94 int count = cne.ranges.getRangeCount(); 95 for (int i = 0; i < count; ++i) { 96 cne.handleCE32(cne.ranges.getRangeStart(i), cne.ranges.getRangeEnd(i), ce32); 97 } 98 } 99 cne.handleCE32(start, end, ce32); 100 } 101 102 public void forCodePoint(CollationData d, int c) { 103 int ce32 = d.getCE32(c); 104 if (ce32 == Collation.FALLBACK_CE32) { 105 d = d.base; 106 ce32 = d.getCE32(c); 107 } 108 data = d; 109 handleCE32(c, c, ce32); 110 } 111 112 private void handleCE32(int start, int end, int ce32) { 113 for (;;) { 114 if ((ce32 & 0xff) < Collation.SPECIAL_CE32_LOW_BYTE) { 115 // !isSpecialCE32() 116 if (sink != null) { 117 sink.handleCE(Collation.ceFromSimpleCE32(ce32)); 118 } 119 return; 120 } 121 switch (Collation.tagFromCE32(ce32)) { 122 case Collation.FALLBACK_TAG: 123 return; 124 case Collation.RESERVED_TAG_3: 125 case Collation.BUILDER_DATA_TAG: 126 case Collation.LEAD_SURROGATE_TAG: 127 // Java porting note: U_INTERNAL_PROGRAM_ERROR is set to errorCode in ICU4C. 128 throw new AssertionError( 129 String.format("Unexpected CE32 tag type %d for ce32=0x%08x", 130 Collation.tagFromCE32(ce32), ce32)); 131 case Collation.LONG_PRIMARY_TAG: 132 if (sink != null) { 133 sink.handleCE(Collation.ceFromLongPrimaryCE32(ce32)); 134 } 135 return; 136 case Collation.LONG_SECONDARY_TAG: 137 if (sink != null) { 138 sink.handleCE(Collation.ceFromLongSecondaryCE32(ce32)); 139 } 140 return; 141 case Collation.LATIN_EXPANSION_TAG: 142 if (sink != null) { 143 ces[0] = Collation.latinCE0FromCE32(ce32); 144 ces[1] = Collation.latinCE1FromCE32(ce32); 145 sink.handleExpansion(ces, 0, 2); 146 } 147 // Optimization: If we have a prefix, 148 // then the relevant strings have been added already. 149 if (unreversedPrefix.length() == 0) { 150 addExpansions(start, end); 151 } 152 return; 153 case Collation.EXPANSION32_TAG: 154 if (sink != null) { 155 int idx = Collation.indexFromCE32(ce32); 156 int length = Collation.lengthFromCE32(ce32); 157 for (int i = 0; i < length; ++i) { 158 ces[i] = Collation.ceFromCE32(data.ce32s[idx + i]); 159 } 160 sink.handleExpansion(ces, 0, length); 161 } 162 // Optimization: If we have a prefix, 163 // then the relevant strings have been added already. 164 if (unreversedPrefix.length() == 0) { 165 addExpansions(start, end); 166 } 167 return; 168 case Collation.EXPANSION_TAG: 169 if (sink != null) { 170 int idx = Collation.indexFromCE32(ce32); 171 int length = Collation.lengthFromCE32(ce32); 172 sink.handleExpansion(data.ces, idx, length); 173 } 174 // Optimization: If we have a prefix, 175 // then the relevant strings have been added already. 176 if (unreversedPrefix.length() == 0) { 177 addExpansions(start, end); 178 } 179 return; 180 case Collation.PREFIX_TAG: 181 handlePrefixes(start, end, ce32); 182 return; 183 case Collation.CONTRACTION_TAG: 184 handleContractions(start, end, ce32); 185 return; 186 case Collation.DIGIT_TAG: 187 // Fetch the non-numeric-collation CE32 and continue. 188 ce32 = data.ce32s[Collation.indexFromCE32(ce32)]; 189 break; 190 case Collation.U0000_TAG: 191 assert (start == 0 && end == 0); 192 // Fetch the normal ce32 for U+0000 and continue. 193 ce32 = data.ce32s[0]; 194 break; 195 case Collation.HANGUL_TAG: 196 if (sink != null) { 197 // TODO: This should be optimized, 198 // especially if [start..end] is the complete Hangul range. (assert that) 199 UTF16CollationIterator iter = new UTF16CollationIterator(data); 200 StringBuilder hangul = new StringBuilder(1); 201 for (int c = start; c <= end; ++c) { 202 hangul.setLength(0); 203 hangul.appendCodePoint(c); 204 iter.setText(false, hangul, 0); 205 int length = iter.fetchCEs(); 206 // Ignore the terminating non-CE. 207 assert (length >= 2 && iter.getCE(length - 1) == Collation.NO_CE); 208 sink.handleExpansion(iter.getCEs(), 0, length - 1); 209 } 210 } 211 // Optimization: If we have a prefix, 212 // then the relevant strings have been added already. 213 if (unreversedPrefix.length() == 0) { 214 addExpansions(start, end); 215 } 216 return; 217 case Collation.OFFSET_TAG: 218 // Currently no need to send offset CEs to the sink. 219 return; 220 case Collation.IMPLICIT_TAG: 221 // Currently no need to send implicit CEs to the sink. 222 return; 223 } 224 } 225 } 226 227 private void handlePrefixes(int start, int end, int ce32) { 228 int index = Collation.indexFromCE32(ce32); 229 ce32 = data.getCE32FromContexts(index); // Default if no prefix match. 230 handleCE32(start, end, ce32); 231 if (!addPrefixes) { 232 return; 233 } 234 CharsTrie.Iterator prefixes = new CharsTrie(data.contexts, index + 2).iterator(); 235 while (prefixes.hasNext()) { 236 Entry e = prefixes.next(); 237 setPrefix(e.chars); 238 // Prefix/pre-context mappings are special kinds of contractions 239 // that always yield expansions. 240 addStrings(start, end, contractions); 241 addStrings(start, end, expansions); 242 handleCE32(start, end, e.value); 243 } 244 resetPrefix(); 245 } 246 247 void handleContractions(int start, int end, int ce32) { 248 int index = Collation.indexFromCE32(ce32); 249 if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 250 // No match on the single code point. 251 // We are underneath a prefix, and the default mapping is just 252 // a fallback to the mappings for a shorter prefix. 253 assert (unreversedPrefix.length() != 0); 254 } else { 255 ce32 = data.getCE32FromContexts(index); // Default if no suffix match. 256 assert (!Collation.isContractionCE32(ce32)); 257 handleCE32(start, end, ce32); 258 } 259 CharsTrie.Iterator suffixes = new CharsTrie(data.contexts, index + 2).iterator(); 260 while (suffixes.hasNext()) { 261 Entry e = suffixes.next(); 262 suffix = e.chars.toString(); 263 addStrings(start, end, contractions); 264 if (unreversedPrefix.length() != 0) { 265 addStrings(start, end, expansions); 266 } 267 handleCE32(start, end, e.value); 268 } 269 suffix = null; 270 } 271 272 void addExpansions(int start, int end) { 273 if (unreversedPrefix.length() == 0 && suffix == null) { 274 if (expansions != null) { 275 expansions.add(start, end); 276 } 277 } else { 278 addStrings(start, end, expansions); 279 } 280 } 281 282 void addStrings(int start, int end, UnicodeSet set) { 283 if (set == null) { 284 return; 285 } 286 StringBuilder s = new StringBuilder(unreversedPrefix); 287 do { 288 s.appendCodePoint(start); 289 if (suffix != null) { 290 s.append(suffix); 291 } 292 set.add(s); 293 s.setLength(unreversedPrefix.length()); 294 } while (++start <= end); 295 } 296 297 // Prefixes are reversed in the data structure. 298 private void setPrefix(CharSequence pfx) { 299 unreversedPrefix.setLength(0); 300 unreversedPrefix.append(pfx).reverse(); 301 } 302 303 private void resetPrefix() { 304 unreversedPrefix.setLength(0); 305 } 306 }