1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 1996-2014, International Business Machines Corporation and * 7 * others. All Rights Reserved. * 8 ******************************************************************************* 9 */ 10 package android.icu.text; 11 12 import java.util.ArrayList; 13 import java.util.HashSet; 14 import java.util.Iterator; 15 import java.util.List; 16 import java.util.Set; 17 18 import android.icu.impl.Norm2AllModes; 19 import android.icu.impl.Normalizer2Impl; 20 import android.icu.impl.Utility; 21 import android.icu.lang.UCharacter; 22 23 /** 24 * This class allows one to iterate through all the strings that are canonically equivalent to a given 25 * string. For example, here are some sample results: 26 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 27 * <pre> 28 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 29 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 30 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 31 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 32 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 33 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 34 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 35 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 36 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA} 37 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE} 38 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA} 39 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE} 40 *</pre> 41 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 42 * since it has not been optimized for that situation. 43 * @author M. Davis 44 * @hide Only a subset of ICU is exposed in Android 45 */ 46 47 public final class CanonicalIterator { 48 /** 49 * Construct a CanonicalIterator object 50 * @param source string to get results for 51 */ 52 public CanonicalIterator(String source) { 53 Norm2AllModes allModes = Norm2AllModes.getNFCInstance(); 54 nfd = allModes.decomp; 55 nfcImpl = allModes.impl.ensureCanonIterData(); 56 setSource(source); 57 } 58 59 /** 60 * Gets the NFD form of the current source we are iterating over. 61 * @return gets the source: NOTE: it is the NFD form of the source originally passed in 62 */ 63 public String getSource() { 64 return source; 65 } 66 67 /** 68 * Resets the iterator so that one can start again from the beginning. 69 */ 70 public void reset() { 71 done = false; 72 for (int i = 0; i < current.length; ++i) { 73 current[i] = 0; 74 } 75 } 76 77 /** 78 * Get the next canonically equivalent string. 79 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 80 * @return the next string that is canonically equivalent. The value null is returned when 81 * the iteration is done. 82 */ 83 public String next() { 84 if (done) return null; 85 86 // construct return value 87 88 buffer.setLength(0); // delete old contents 89 for (int i = 0; i < pieces.length; ++i) { 90 buffer.append(pieces[i][current[i]]); 91 } 92 String result = buffer.toString(); 93 94 // find next value for next time 95 96 for (int i = current.length - 1; ; --i) { 97 if (i < 0) { 98 done = true; 99 break; 100 } 101 current[i]++; 102 if (current[i] < pieces[i].length) break; // got sequence 103 current[i] = 0; 104 } 105 return result; 106 } 107 108 /** 109 * Set a new source for this iterator. Allows object reuse. 110 * @param newSource the source string to iterate against. This allows the same iterator to be used 111 * while changing the source string, saving object creation. 112 */ 113 public void setSource(String newSource) { 114 source = nfd.normalize(newSource); 115 done = false; 116 117 // catch degenerate case 118 if (newSource.length() == 0) { 119 pieces = new String[1][]; 120 current = new int[1]; 121 pieces[0] = new String[]{""}; 122 return; 123 } 124 125 // find the segments 126 List<String> segmentList = new ArrayList<String>(); 127 int cp; 128 int start = 0; 129 130 // i should be the end of the first code point 131 // break up the string into segements 132 133 int i = UTF16.findOffsetFromCodePoint(source, 1); 134 135 for (; i < source.length(); i += Character.charCount(cp)) { 136 cp = source.codePointAt(i); 137 if (nfcImpl.isCanonSegmentStarter(cp)) { 138 segmentList.add(source.substring(start, i)); // add up to i 139 start = i; 140 } 141 } 142 segmentList.add(source.substring(start, i)); // add last one 143 144 // allocate the arrays, and find the strings that are CE to each segment 145 pieces = new String[segmentList.size()][]; 146 current = new int[segmentList.size()]; 147 for (i = 0; i < pieces.length; ++i) { 148 if (PROGRESS) System.out.println("SEGMENT"); 149 pieces[i] = getEquivalents(segmentList.get(i)); 150 } 151 } 152 153 /** 154 * Simple implementation of permutation. 155 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 156 * @param source the string to find permutations for 157 * @param skipZeros set to true to skip characters with canonical combining class zero 158 * @param output the set to add the results to 159 * @deprecated This API is ICU internal only. 160 * @hide draft / provisional / internal are hidden on Android 161 */ 162 @Deprecated 163 public static void permute(String source, boolean skipZeros, Set<String> output) { 164 // TODO: optimize 165 //if (PROGRESS) System.out.println("Permute: " + source); 166 167 // optimization: 168 // if zero or one character, just return a set with it 169 // we check for length < 2 to keep from counting code points all the time 170 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 171 output.add(source); 172 return; 173 } 174 175 // otherwise iterate through the string, and recursively permute all the other characters 176 Set<String> subpermute = new HashSet<String>(); 177 int cp; 178 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 179 cp = UTF16.charAt(source, i); 180 181 // optimization: 182 // if the character is canonical combining class zero, 183 // don't permute it 184 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 185 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 186 continue; 187 } 188 189 // see what the permutations of the characters before and after this one are 190 subpermute.clear(); 191 permute(source.substring(0,i) 192 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 193 194 // prefix this character to all of them 195 String chStr = UTF16.valueOf(source, i); 196 for (String s : subpermute) { 197 String piece = chStr + s; 198 //if (PROGRESS) System.out.println(" Piece: " + piece); 199 output.add(piece); 200 } 201 } 202 } 203 204 // FOR TESTING 205 206 /* 207 *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition. 208 * 209 public static UnicodeSet getSafeStart() { 210 return (UnicodeSet) SAFE_START.clone(); 211 } 212 */ 213 /* 214 *@return the set of characters whose decompositions start with the given character 215 * 216 public static UnicodeSet getStarts(int cp) { 217 UnicodeSet result = AT_START.get(cp); 218 if (result == null) result = EMPTY; 219 return (UnicodeSet) result.clone(); 220 } 221 */ 222 223 // ===================== PRIVATES ============================== 224 225 // debug 226 private static boolean PROGRESS = false; // debug progress 227 //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null; 228 private static boolean SKIP_ZEROS = true; 229 230 // fields 231 private final Normalizer2 nfd; 232 private final Normalizer2Impl nfcImpl; 233 private String source; 234 private boolean done; 235 private String[][] pieces; 236 private int[] current; 237 // Note: C will need two more fields, since arrays there don't have lengths 238 // int pieces_length; 239 // int[] pieces_lengths; 240 241 // transient fields 242 private transient StringBuilder buffer = new StringBuilder(); 243 244 245 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 246 private String[] getEquivalents(String segment) { 247 Set<String> result = new HashSet<String>(); 248 Set<String> basic = getEquivalents2(segment); 249 Set<String> permutations = new HashSet<String>(); 250 251 // now get all the permutations 252 // add only the ones that are canonically equivalent 253 // TODO: optimize by not permuting any class zero. 254 Iterator<String> it = basic.iterator(); 255 while (it.hasNext()) { 256 String item = it.next(); 257 permutations.clear(); 258 permute(item, SKIP_ZEROS, permutations); 259 Iterator<String> it2 = permutations.iterator(); 260 while (it2.hasNext()) { 261 String possible = it2.next(); 262 263 /* 264 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0); 265 if (attempt.equals(segment)) { 266 */ 267 if (Normalizer.compare(possible, segment,0)==0) { 268 269 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 270 result.add(possible); 271 272 } else { 273 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 274 } 275 } 276 } 277 278 // convert into a String[] to clean up storage 279 String[] finalResult = new String[result.size()]; 280 result.toArray(finalResult); 281 return finalResult; 282 } 283 284 285 private Set<String> getEquivalents2(String segment) { 286 287 Set<String> result = new HashSet<String>(); 288 289 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 290 291 result.add(segment); 292 StringBuffer workingBuffer = new StringBuffer(); 293 UnicodeSet starts = new UnicodeSet(); 294 295 // cycle through all the characters 296 int cp; 297 for (int i = 0; i < segment.length(); i += Character.charCount(cp)) { 298 299 // see if any character is at the start of some decomposition 300 cp = segment.codePointAt(i); 301 if (!nfcImpl.getCanonStartSet(cp, starts)) { 302 continue; 303 } 304 // if so, see which decompositions match 305 for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) { 306 int cp2 = iter.codepoint; 307 Set<String> remainder = extract(cp2, segment, i, workingBuffer); 308 if (remainder == null) { 309 continue; 310 } 311 312 // there were some matches, so add all the possibilities to the set. 313 String prefix= segment.substring(0,i); 314 prefix += UTF16.valueOf(cp2); 315 for (String item : remainder) { 316 result.add(prefix + item); 317 } 318 } 319 } 320 return result; 321 /* 322 Set result = new HashSet(); 323 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment)); 324 result.add(segment); 325 StringBuffer workingBuffer = new StringBuffer(); 326 327 // cycle through all the characters 328 int cp; 329 330 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 331 // see if any character is at the start of some decomposition 332 cp = UTF16.charAt(segment, i); 333 NormalizerImpl.getCanonStartSet(c,fillSet) 334 UnicodeSet starts = AT_START.get(cp); 335 if (starts == null) continue; 336 UnicodeSetIterator usi = new UnicodeSetIterator(starts); 337 // if so, see which decompositions match 338 while (usi.next()) { 339 int cp2 = usi.codepoint; 340 // we know that there are no strings in it 341 // so we don't have to check CharacterIterator.IS_STRING 342 Set remainder = extract(cp2, segment, i, workingBuffer); 343 if (remainder == null) continue; 344 345 // there were some matches, so add all the possibilities to the set. 346 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2); 347 Iterator it = remainder.iterator(); 348 while (it.hasNext()) { 349 String item = (String) it.next(); 350 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item)); 351 result.add(prefix + item); 352 } 353 } 354 } 355 return result; 356 */ 357 } 358 359 /** 360 * See if the decomposition of cp2 is at segment starting at segmentPos 361 * (with canonical rearrangment!) 362 * If so, take the remainder, and return the equivalents 363 */ 364 private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) { 365 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 366 + ", " + Utility.hex(segment.substring(segmentPos))); 367 368 String decomp = nfcImpl.getDecomposition(comp); 369 if (decomp == null) { 370 decomp = UTF16.valueOf(comp); 371 } 372 373 // See if it matches the start of segment (at segmentPos) 374 boolean ok = false; 375 int cp; 376 int decompPos = 0; 377 int decompCp = UTF16.charAt(decomp,0); 378 decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char 379 //int decompClass = getClass(decompCp); 380 buf.setLength(0); // initialize working buffer, shared among callees 381 382 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 383 cp = UTF16.charAt(segment, i); 384 if (cp == decompCp) { // if equal, eat another cp from decomp 385 if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 386 if (decompPos == decomp.length()) { // done, have all decomp characters! 387 buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars 388 ok = true; 389 break; 390 } 391 decompCp = UTF16.charAt(decomp, decompPos); 392 decompPos += UTF16.getCharCount(decompCp); 393 //decompClass = getClass(decompCp); 394 } else { 395 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 396 // brute force approach 397 UTF16.append(buf, cp); 398 /* TODO: optimize 399 // since we know that the classes are monotonically increasing, after zero 400 // e.g. 0 5 7 9 0 3 401 // we can do an optimization 402 // there are only a few cases that work: zero, less, same, greater 403 // if both classes are the same, we fail 404 // if the decomp class < the segment class, we fail 405 406 segClass = getClass(cp); 407 if (decompClass <= segClass) return null; 408 */ 409 } 410 } 411 if (!ok) return null; // we failed, characters left over 412 if (PROGRESS) System.out.println("Matches"); 413 if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder 414 String remainder = buf.toString(); 415 416 // brute force approach 417 // to check to make sure result is canonically equivalent 418 /* 419 String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0); 420 if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null; 421 */ 422 423 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 424 425 // get the remaining combinations 426 return getEquivalents2(remainder); 427 } 428 429 /* 430 // TODO: fix once we have a codepoint interface to get the canonical combining class 431 // TODO: Need public access to canonical combining class in UCharacter! 432 private static int getClass(int cp) { 433 return Normalizer.getClass((char)cp); 434 } 435 */ 436 437 // ================= BUILDER ========================= 438 // TODO: Flatten this data so it doesn't have to be reconstructed each time! 439 440 //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change 441 private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change 442 static { 443 SET_WITH_NULL_STRING.add(""); 444 } 445 446 // private static UnicodeSet SAFE_START = new UnicodeSet(); 447 // private static CharMap AT_START = new CharMap(); 448 449 // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!; 450 // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded. 451 // private static int LAST_UNICODE = 0x10FFFF; 452 /* 453 static { 454 buildData(); 455 } 456 */ 457 /* 458 private static void buildData() { 459 460 if (PROGRESS) System.out.println("Getting Safe Start"); 461 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 462 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 463 int cc = UCharacter.getCombiningClass(cp); 464 if (cc == 0) SAFE_START.add(cp); 465 // will fix to be really safe below 466 } 467 if (PROGRESS) System.out.println(); 468 469 if (PROGRESS) System.out.println("Getting Containment"); 470 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 471 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 472 473 if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue; 474 475 //String istr = UTF16.valueOf(cp); 476 String decomp = Normalizer.normalize(cp, Normalizer.NFD); 477 //if (decomp.equals(istr)) continue; 478 479 // add each character in the decomposition to canBeIn 480 481 int component; 482 for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) { 483 component = UTF16.charAt(decomp, i); 484 if (i == 0) { 485 AT_START.add(component, cp); 486 } else if (UCharacter.getCombiningClass(component) == 0) { 487 SAFE_START.remove(component); 488 } 489 } 490 } 491 if (PROGRESS) System.out.println(); 492 } 493 // the following is just for a map from characters to a set of characters 494 495 private static class CharMap { 496 Map storage = new HashMap(); 497 MutableInt probe = new MutableInt(); 498 boolean converted = false; 499 500 public void add(int cp, int whatItIsIn) { 501 UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp)); 502 if (result == null) { 503 result = new UnicodeSet(); 504 storage.put(probe, result); 505 } 506 result.add(whatItIsIn); 507 } 508 509 public UnicodeSet get(int cp) { 510 return (UnicodeSet) storage.get(probe.set(cp)); 511 } 512 } 513 514 private static class MutableInt { 515 public int contents; 516 public int hashCode() { return contents; } 517 public boolean equals(Object other) { 518 return ((MutableInt)other).contents == contents; 519 } 520 // allows chaining 521 public MutableInt set(int contents) { 522 this.contents = contents; 523 return this; 524 } 525 } 526 */ 527 528 } 529