1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2013-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 ******************************************************************************* 9 * TailoredSet.java, ported from collationsets.h/.cpp 10 * 11 * C++ version created on: 2013feb09 12 * created by: Markus W. Scherer 13 */ 14 15 package android.icu.impl.coll; 16 17 import java.util.Iterator; 18 19 import android.icu.impl.Normalizer2Impl.Hangul; 20 import android.icu.impl.Trie2; 21 import android.icu.impl.Utility; 22 import android.icu.text.UnicodeSet; 23 import android.icu.util.CharsTrie; 24 import android.icu.util.CharsTrie.Entry; 25 26 /** 27 * Finds the set of characters and strings that sort differently in the tailoring 28 * from the base data. 29 * 30 * Every mapping in the tailoring needs to be compared to the base, 31 * because some mappings are copied for optimization, and 32 * all contractions for a character are copied if any contractions for that character 33 * are added, modified or removed. 34 * 35 * It might be simpler to re-parse the rule string, but: 36 * - That would require duplicating some of the from-rules builder code. 37 * - That would make the runtime code depend on the builder. 38 * - That would only work if we have the rule string, and we allow users to 39 * omit the rule string from data files. 40 * @hide Only a subset of ICU is exposed in Android 41 */ 42 public final class TailoredSet { 43 44 private CollationData data; 45 private CollationData baseData; 46 private UnicodeSet tailored; 47 private StringBuilder unreversedPrefix = new StringBuilder(); 48 private String suffix; 49 50 public TailoredSet(UnicodeSet t) { 51 tailored = t; 52 } 53 54 public void forData(CollationData d) { 55 data = d; 56 baseData = d.base; 57 assert (baseData != null); 58 // utrie2_enum(data->trie, NULL, enumTailoredRange, this); 59 Iterator<Trie2.Range> trieIterator = data.trie.iterator(); 60 Trie2.Range range; 61 while (trieIterator.hasNext() && !(range = trieIterator.next()).leadSurrogate) { 62 enumTailoredRange(range.startCodePoint, range.endCodePoint, range.value, this); 63 } 64 } 65 66 private void enumTailoredRange(int start, int end, int ce32, TailoredSet ts) { 67 if (ce32 == Collation.FALLBACK_CE32) { 68 return; // fallback to base, not tailored 69 } 70 ts.handleCE32(start, end, ce32); 71 } 72 73 // Java porting note: ICU4C returns U_SUCCESS(error) and it's not applicable to ICU4J. 74 // Also, ICU4C requires handleCE32() to be public because it is used by the callback 75 // function (enumTailoredRange()). This is not necessary for Java implementation. 76 private void handleCE32(int start, int end, int ce32) { 77 assert (ce32 != Collation.FALLBACK_CE32); 78 if (Collation.isSpecialCE32(ce32)) { 79 ce32 = data.getIndirectCE32(ce32); 80 if (ce32 == Collation.FALLBACK_CE32) { 81 return; 82 } 83 } 84 do { 85 int baseCE32 = baseData.getFinalCE32(baseData.getCE32(start)); 86 // Do not just continue if ce32 == baseCE32 because 87 // contractions and expansions in different data objects 88 // normally differ even if they have the same data offsets. 89 if (Collation.isSelfContainedCE32(ce32) && Collation.isSelfContainedCE32(baseCE32)) { 90 // fastpath 91 if (ce32 != baseCE32) { 92 tailored.add(start); 93 } 94 } else { 95 compare(start, ce32, baseCE32); 96 } 97 } while (++start <= end); 98 } 99 100 private void compare(int c, int ce32, int baseCE32) { 101 if (Collation.isPrefixCE32(ce32)) { 102 int dataIndex = Collation.indexFromCE32(ce32); 103 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 104 if (Collation.isPrefixCE32(baseCE32)) { 105 int baseIndex = Collation.indexFromCE32(baseCE32); 106 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 107 comparePrefixes(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 108 } else { 109 addPrefixes(data, c, data.contexts, dataIndex + 2); 110 } 111 } else if (Collation.isPrefixCE32(baseCE32)) { 112 int baseIndex = Collation.indexFromCE32(baseCE32); 113 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 114 addPrefixes(baseData, c, baseData.contexts, baseIndex + 2); 115 } 116 117 if (Collation.isContractionCE32(ce32)) { 118 int dataIndex = Collation.indexFromCE32(ce32); 119 if ((ce32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 120 ce32 = Collation.NO_CE32; 121 } else { 122 ce32 = data.getFinalCE32(data.getCE32FromContexts(dataIndex)); 123 } 124 if (Collation.isContractionCE32(baseCE32)) { 125 int baseIndex = Collation.indexFromCE32(baseCE32); 126 if ((baseCE32 & Collation.CONTRACT_SINGLE_CP_NO_MATCH) != 0) { 127 baseCE32 = Collation.NO_CE32; 128 } else { 129 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 130 } 131 compareContractions(c, data.contexts, dataIndex + 2, baseData.contexts, baseIndex + 2); 132 } else { 133 addContractions(c, data.contexts, dataIndex + 2); 134 } 135 } else if (Collation.isContractionCE32(baseCE32)) { 136 int baseIndex = Collation.indexFromCE32(baseCE32); 137 baseCE32 = baseData.getFinalCE32(baseData.getCE32FromContexts(baseIndex)); 138 addContractions(c, baseData.contexts, baseIndex + 2); 139 } 140 141 int tag; 142 if (Collation.isSpecialCE32(ce32)) { 143 tag = Collation.tagFromCE32(ce32); 144 assert (tag != Collation.PREFIX_TAG); 145 assert (tag != Collation.CONTRACTION_TAG); 146 // Currently, the tailoring data builder does not write offset tags. 147 // They might be useful for saving space, 148 // but they would complicate the builder, 149 // and in tailorings we assume that performance of tailored characters is more important. 150 assert (tag != Collation.OFFSET_TAG); 151 } else { 152 tag = -1; 153 } 154 int baseTag; 155 if (Collation.isSpecialCE32(baseCE32)) { 156 baseTag = Collation.tagFromCE32(baseCE32); 157 assert (baseTag != Collation.PREFIX_TAG); 158 assert (baseTag != Collation.CONTRACTION_TAG); 159 } else { 160 baseTag = -1; 161 } 162 163 // Non-contextual mappings, expansions, etc. 164 if (baseTag == Collation.OFFSET_TAG) { 165 // We might be comparing a tailoring CE which is a copy of 166 // a base offset-tag CE, via the [optimize [set]] syntax 167 // or when a single-character mapping was copied for tailored contractions. 168 // Offset tags always result in long-primary CEs, 169 // with common secondary/tertiary weights. 170 if (!Collation.isLongPrimaryCE32(ce32)) { 171 add(c); 172 return; 173 } 174 long dataCE = baseData.ces[Collation.indexFromCE32(baseCE32)]; 175 long p = Collation.getThreeBytePrimaryForOffsetData(c, dataCE); 176 if (Collation.primaryFromLongPrimaryCE32(ce32) != p) { 177 add(c); 178 return; 179 } 180 } 181 182 if (tag != baseTag) { 183 add(c); 184 return; 185 } 186 187 if (tag == Collation.EXPANSION32_TAG) { 188 int length = Collation.lengthFromCE32(ce32); 189 int baseLength = Collation.lengthFromCE32(baseCE32); 190 191 if (length != baseLength) { 192 add(c); 193 return; 194 } 195 196 int idx0 = Collation.indexFromCE32(ce32); 197 int idx1 = Collation.indexFromCE32(baseCE32); 198 199 for (int i = 0; i < length; ++i) { 200 if (data.ce32s[idx0 + i] != baseData.ce32s[idx1 + i]) { 201 add(c); 202 break; 203 } 204 } 205 } else if (tag == Collation.EXPANSION_TAG) { 206 int length = Collation.lengthFromCE32(ce32); 207 int baseLength = Collation.lengthFromCE32(baseCE32); 208 209 if (length != baseLength) { 210 add(c); 211 return; 212 } 213 214 int idx0 = Collation.indexFromCE32(ce32); 215 int idx1 = Collation.indexFromCE32(baseCE32); 216 217 for (int i = 0; i < length; ++i) { 218 if (data.ces[idx0 + i] != baseData.ces[idx1 + i]) { 219 add(c); 220 break; 221 } 222 } 223 } else if (tag == Collation.HANGUL_TAG) { 224 StringBuilder jamos = new StringBuilder(); 225 int length = Hangul.decompose(c, jamos); 226 if (tailored.contains(jamos.charAt(0)) || tailored.contains(jamos.charAt(1)) 227 || (length == 3 && tailored.contains(jamos.charAt(2)))) { 228 add(c); 229 } 230 } else if (ce32 != baseCE32) { 231 add(c); 232 } 233 } 234 235 private void comparePrefixes(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 236 // Parallel iteration over prefixes of both tables. 237 CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 238 CharsTrie.Iterator basePrefixes = new CharsTrie(q, qidx).iterator(); 239 String tp = null; // Tailoring prefix. 240 String bp = null; // Base prefix. 241 // Use a string with a U+FFFF as the limit sentinel. 242 // U+FFFF is untailorable and will not occur in prefixes. 243 String none = "\uffff"; 244 Entry te = null, be = null; 245 for (;;) { 246 if (tp == null) { 247 if (prefixes.hasNext()) { 248 te = prefixes.next(); 249 tp = te.chars.toString(); 250 } else { 251 te = null; 252 tp = none; 253 } 254 } 255 if (bp == null) { 256 if (basePrefixes.hasNext()) { 257 be = basePrefixes.next(); 258 bp = be.chars.toString(); 259 } else { 260 be = null; 261 bp = none; 262 } 263 } 264 if (Utility.sameObjects(tp, none) && Utility.sameObjects(bp, none)) { 265 break; 266 } 267 int cmp = tp.compareTo(bp); 268 if (cmp < 0) { 269 // tp occurs in the tailoring but not in the base. 270 assert (te != null); 271 addPrefix(data, tp, c, te.value); 272 te = null; 273 tp = null; 274 } else if (cmp > 0) { 275 // bp occurs in the base but not in the tailoring. 276 assert (be != null); 277 addPrefix(baseData, bp, c, be.value); 278 be = null; 279 bp = null; 280 } else { 281 setPrefix(tp); 282 assert (te != null && be != null); 283 compare(c, te.value, be.value); 284 resetPrefix(); 285 te = be = null; 286 tp = bp = null; 287 } 288 } 289 } 290 291 private void compareContractions(int c, CharSequence p, int pidx, CharSequence q, int qidx) { 292 // Parallel iteration over suffixes of both tables. 293 CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 294 CharsTrie.Iterator baseSuffixes = new CharsTrie(q, qidx).iterator(); 295 String ts = null; // Tailoring suffix. 296 String bs = null; // Base suffix. 297 // Use a string with two U+FFFF as the limit sentinel. 298 // U+FFFF is untailorable and will not occur in contractions except maybe 299 // as a single suffix character for a root-collator boundary contraction. 300 String none = "\uffff\uffff"; 301 Entry te = null, be = null; 302 for (;;) { 303 if (ts == null) { 304 if (suffixes.hasNext()) { 305 te = suffixes.next(); 306 ts = te.chars.toString(); 307 } else { 308 te = null; 309 ts = none; 310 } 311 } 312 if (bs == null) { 313 if (baseSuffixes.hasNext()) { 314 be = baseSuffixes.next(); 315 bs = be.chars.toString(); 316 } else { 317 be = null; 318 bs = none; 319 } 320 } 321 if (Utility.sameObjects(ts, none) && Utility.sameObjects(bs, none)) { 322 break; 323 } 324 int cmp = ts.compareTo(bs); 325 if (cmp < 0) { 326 // ts occurs in the tailoring but not in the base. 327 addSuffix(c, ts); 328 te = null; 329 ts = null; 330 } else if (cmp > 0) { 331 // bs occurs in the base but not in the tailoring. 332 addSuffix(c, bs); 333 be = null; 334 bs = null; 335 } else { 336 suffix = ts; 337 compare(c, te.value, be.value); 338 suffix = null; 339 te = be = null; 340 ts = bs = null; 341 } 342 } 343 } 344 345 private void addPrefixes(CollationData d, int c, CharSequence p, int pidx) { 346 CharsTrie.Iterator prefixes = new CharsTrie(p, pidx).iterator(); 347 while (prefixes.hasNext()) { 348 Entry e = prefixes.next(); 349 addPrefix(d, e.chars, c, e.value); 350 } 351 } 352 353 private void addPrefix(CollationData d, CharSequence pfx, int c, int ce32) { 354 setPrefix(pfx); 355 ce32 = d.getFinalCE32(ce32); 356 if (Collation.isContractionCE32(ce32)) { 357 int idx = Collation.indexFromCE32(ce32); 358 addContractions(c, d.contexts, idx + 2); 359 } 360 tailored.add(new StringBuilder(unreversedPrefix.appendCodePoint(c))); 361 resetPrefix(); 362 } 363 364 private void addContractions(int c, CharSequence p, int pidx) { 365 CharsTrie.Iterator suffixes = new CharsTrie(p, pidx).iterator(); 366 while (suffixes.hasNext()) { 367 Entry e = suffixes.next(); 368 addSuffix(c, e.chars); 369 } 370 } 371 372 private void addSuffix(int c, CharSequence sfx) { 373 tailored.add(new StringBuilder(unreversedPrefix).appendCodePoint(c).append(sfx)); 374 } 375 376 private void add(int c) { 377 if (unreversedPrefix.length() == 0 && suffix == null) { 378 tailored.add(c); 379 } else { 380 StringBuilder s = new StringBuilder(unreversedPrefix); 381 s.appendCodePoint(c); 382 if (suffix != null) { 383 s.append(suffix); 384 } 385 tailored.add(s); 386 } 387 } 388 389 // Prefixes are reversed in the data structure. 390 private void setPrefix(CharSequence pfx) { 391 unreversedPrefix.setLength(0); 392 unreversedPrefix.append(pfx).reverse(); 393 } 394 395 private void resetPrefix() { 396 unreversedPrefix.setLength(0); 397 } 398 } 399 400