1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2014, International Business Machines Corporation and * 6 * others. All Rights Reserved. * 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.text; 10 11 import java.util.ArrayList; 12 import java.util.HashSet; 13 import java.util.Iterator; 14 import java.util.List; 15 import java.util.Set; 16 17 import com.ibm.icu.impl.Norm2AllModes; 18 import com.ibm.icu.impl.Normalizer2Impl; 19 import com.ibm.icu.impl.Utility; 20 import com.ibm.icu.lang.UCharacter; 21 22 /** 23 * This class allows one to iterate through all the strings that are canonically equivalent to a given 24 * string. For example, here are some sample results: 25 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 26 * <pre> 27 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 28 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 29 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 30 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 31 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 32 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 33 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 34 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 35 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA} 36 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE} 37 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA} 38 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE} 39 *</pre> 40 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 41 * since it has not been optimized for that situation. 42 * @author M. Davis 43 * @stable ICU 2.4 44 */ 45 46 public final class CanonicalIterator { 47 /** 48 * Construct a CanonicalIterator object 49 * @param source string to get results for 50 * @stable ICU 2.4 51 */ 52 public CanonicalIterator(String source) { 53 Norm2AllModes allModes = Norm2AllModes.getNFCInstance(); 54 nfd = allModes.decomp; 55 nfcImpl = allModes.impl.ensureCanonIterData(); 56 setSource(source); 57 } 58 59 /** 60 * Gets the NFD form of the current source we are iterating over. 61 * @return gets the source: NOTE: it is the NFD form of the source originally passed in 62 * @stable ICU 2.4 63 */ 64 public String getSource() { 65 return source; 66 } 67 68 /** 69 * Resets the iterator so that one can start again from the beginning. 70 * @stable ICU 2.4 71 */ 72 public void reset() { 73 done = false; 74 for (int i = 0; i < current.length; ++i) { 75 current[i] = 0; 76 } 77 } 78 79 /** 80 * Get the next canonically equivalent string. 81 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 82 * @return the next string that is canonically equivalent. The value null is returned when 83 * the iteration is done. 84 * @stable ICU 2.4 85 */ 86 public String next() { 87 if (done) return null; 88 89 // construct return value 90 91 buffer.setLength(0); // delete old contents 92 for (int i = 0; i < pieces.length; ++i) { 93 buffer.append(pieces[i][current[i]]); 94 } 95 String result = buffer.toString(); 96 97 // find next value for next time 98 99 for (int i = current.length - 1; ; --i) { 100 if (i < 0) { 101 done = true; 102 break; 103 } 104 current[i]++; 105 if (current[i] < pieces[i].length) break; // got sequence 106 current[i] = 0; 107 } 108 return result; 109 } 110 111 /** 112 * Set a new source for this iterator. Allows object reuse. 113 * @param newSource the source string to iterate against. This allows the same iterator to be used 114 * while changing the source string, saving object creation. 115 * @stable ICU 2.4 116 */ 117 public void setSource(String newSource) { 118 source = nfd.normalize(newSource); 119 done = false; 120 121 // catch degenerate case 122 if (newSource.length() == 0) { 123 pieces = new String[1][]; 124 current = new int[1]; 125 pieces[0] = new String[]{""}; 126 return; 127 } 128 129 // find the segments 130 List<String> segmentList = new ArrayList<String>(); 131 int cp; 132 int start = 0; 133 134 // i should be the end of the first code point 135 // break up the string into segements 136 137 int i = UTF16.findOffsetFromCodePoint(source, 1); 138 139 for (; i < source.length(); i += Character.charCount(cp)) { 140 cp = source.codePointAt(i); 141 if (nfcImpl.isCanonSegmentStarter(cp)) { 142 segmentList.add(source.substring(start, i)); // add up to i 143 start = i; 144 } 145 } 146 segmentList.add(source.substring(start, i)); // add last one 147 148 // allocate the arrays, and find the strings that are CE to each segment 149 pieces = new String[segmentList.size()][]; 150 current = new int[segmentList.size()]; 151 for (i = 0; i < pieces.length; ++i) { 152 if (PROGRESS) System.out.println("SEGMENT"); 153 pieces[i] = getEquivalents(segmentList.get(i)); 154 } 155 } 156 157 /** 158 * Simple implementation of permutation. 159 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 160 * @param source the string to find permutations for 161 * @param skipZeros set to true to skip characters with canonical combining class zero 162 * @param output the set to add the results to 163 * @internal 164 * @deprecated This API is ICU internal only. 165 */ 166 @Deprecated 167 public static void permute(String source, boolean skipZeros, Set<String> output) { 168 // TODO: optimize 169 //if (PROGRESS) System.out.println("Permute: " + source); 170 171 // optimization: 172 // if zero or one character, just return a set with it 173 // we check for length < 2 to keep from counting code points all the time 174 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 175 output.add(source); 176 return; 177 } 178 179 // otherwise iterate through the string, and recursively permute all the other characters 180 Set<String> subpermute = new HashSet<String>(); 181 int cp; 182 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 183 cp = UTF16.charAt(source, i); 184 185 // optimization: 186 // if the character is canonical combining class zero, 187 // don't permute it 188 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 189 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 190 continue; 191 } 192 193 // see what the permutations of the characters before and after this one are 194 subpermute.clear(); 195 permute(source.substring(0,i) 196 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 197 198 // prefix this character to all of them 199 String chStr = UTF16.valueOf(source, i); 200 for (String s : subpermute) { 201 String piece = chStr + s; 202 //if (PROGRESS) System.out.println(" Piece: " + piece); 203 output.add(piece); 204 } 205 } 206 } 207 208 // FOR TESTING 209 210 /* 211 *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition. 212 * 213 public static UnicodeSet getSafeStart() { 214 return (UnicodeSet) SAFE_START.clone(); 215 } 216 */ 217 /* 218 *@return the set of characters whose decompositions start with the given character 219 * 220 public static UnicodeSet getStarts(int cp) { 221 UnicodeSet result = AT_START.get(cp); 222 if (result == null) result = EMPTY; 223 return (UnicodeSet) result.clone(); 224 } 225 */ 226 227 // ===================== PRIVATES ============================== 228 229 // debug 230 private static boolean PROGRESS = false; // debug progress 231 //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null; 232 private static boolean SKIP_ZEROS = true; 233 234 // fields 235 private final Normalizer2 nfd; 236 private final Normalizer2Impl nfcImpl; 237 private String source; 238 private boolean done; 239 private String[][] pieces; 240 private int[] current; 241 // Note: C will need two more fields, since arrays there don't have lengths 242 // int pieces_length; 243 // int[] pieces_lengths; 244 245 // transient fields 246 private transient StringBuilder buffer = new StringBuilder(); 247 248 249 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 250 private String[] getEquivalents(String segment) { 251 Set<String> result = new HashSet<String>(); 252 Set<String> basic = getEquivalents2(segment); 253 Set<String> permutations = new HashSet<String>(); 254 255 // now get all the permutations 256 // add only the ones that are canonically equivalent 257 // TODO: optimize by not permuting any class zero. 258 Iterator<String> it = basic.iterator(); 259 while (it.hasNext()) { 260 String item = it.next(); 261 permutations.clear(); 262 permute(item, SKIP_ZEROS, permutations); 263 Iterator<String> it2 = permutations.iterator(); 264 while (it2.hasNext()) { 265 String possible = it2.next(); 266 267 /* 268 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0); 269 if (attempt.equals(segment)) { 270 */ 271 if (Normalizer.compare(possible, segment,0)==0) { 272 273 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 274 result.add(possible); 275 276 } else { 277 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 278 } 279 } 280 } 281 282 // convert into a String[] to clean up storage 283 String[] finalResult = new String[result.size()]; 284 result.toArray(finalResult); 285 return finalResult; 286 } 287 288 289 private Set<String> getEquivalents2(String segment) { 290 291 Set<String> result = new HashSet<String>(); 292 293 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 294 295 result.add(segment); 296 StringBuffer workingBuffer = new StringBuffer(); 297 UnicodeSet starts = new UnicodeSet(); 298 299 // cycle through all the characters 300 int cp; 301 for (int i = 0; i < segment.length(); i += Character.charCount(cp)) { 302 303 // see if any character is at the start of some decomposition 304 cp = segment.codePointAt(i); 305 if (!nfcImpl.getCanonStartSet(cp, starts)) { 306 continue; 307 } 308 // if so, see which decompositions match 309 for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) { 310 int cp2 = iter.codepoint; 311 Set<String> remainder = extract(cp2, segment, i, workingBuffer); 312 if (remainder == null) { 313 continue; 314 } 315 316 // there were some matches, so add all the possibilities to the set. 317 String prefix= segment.substring(0,i); 318 prefix += UTF16.valueOf(cp2); 319 for (String item : remainder) { 320 result.add(prefix + item); 321 } 322 } 323 } 324 return result; 325 /* 326 Set result = new HashSet(); 327 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment)); 328 result.add(segment); 329 StringBuffer workingBuffer = new StringBuffer(); 330 331 // cycle through all the characters 332 int cp; 333 334 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 335 // see if any character is at the start of some decomposition 336 cp = UTF16.charAt(segment, i); 337 NormalizerImpl.getCanonStartSet(c,fillSet) 338 UnicodeSet starts = AT_START.get(cp); 339 if (starts == null) continue; 340 UnicodeSetIterator usi = new UnicodeSetIterator(starts); 341 // if so, see which decompositions match 342 while (usi.next()) { 343 int cp2 = usi.codepoint; 344 // we know that there are no strings in it 345 // so we don't have to check CharacterIterator.IS_STRING 346 Set remainder = extract(cp2, segment, i, workingBuffer); 347 if (remainder == null) continue; 348 349 // there were some matches, so add all the possibilities to the set. 350 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2); 351 Iterator it = remainder.iterator(); 352 while (it.hasNext()) { 353 String item = (String) it.next(); 354 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item)); 355 result.add(prefix + item); 356 } 357 } 358 } 359 return result; 360 */ 361 } 362 363 /** 364 * See if the decomposition of cp2 is at segment starting at segmentPos 365 * (with canonical rearrangment!) 366 * If so, take the remainder, and return the equivalents 367 */ 368 private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) { 369 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 370 + ", " + Utility.hex(segment.substring(segmentPos))); 371 372 String decomp = nfcImpl.getDecomposition(comp); 373 if (decomp == null) { 374 decomp = UTF16.valueOf(comp); 375 } 376 377 // See if it matches the start of segment (at segmentPos) 378 boolean ok = false; 379 int cp; 380 int decompPos = 0; 381 int decompCp = UTF16.charAt(decomp,0); 382 decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char 383 //int decompClass = getClass(decompCp); 384 buf.setLength(0); // initialize working buffer, shared among callees 385 386 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 387 cp = UTF16.charAt(segment, i); 388 if (cp == decompCp) { // if equal, eat another cp from decomp 389 if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 390 if (decompPos == decomp.length()) { // done, have all decomp characters! 391 buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars 392 ok = true; 393 break; 394 } 395 decompCp = UTF16.charAt(decomp, decompPos); 396 decompPos += UTF16.getCharCount(decompCp); 397 //decompClass = getClass(decompCp); 398 } else { 399 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 400 // brute force approach 401 UTF16.append(buf, cp); 402 /* TODO: optimize 403 // since we know that the classes are monotonically increasing, after zero 404 // e.g. 0 5 7 9 0 3 405 // we can do an optimization 406 // there are only a few cases that work: zero, less, same, greater 407 // if both classes are the same, we fail 408 // if the decomp class < the segment class, we fail 409 410 segClass = getClass(cp); 411 if (decompClass <= segClass) return null; 412 */ 413 } 414 } 415 if (!ok) return null; // we failed, characters left over 416 if (PROGRESS) System.out.println("Matches"); 417 if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder 418 String remainder = buf.toString(); 419 420 // brute force approach 421 // to check to make sure result is canonically equivalent 422 /* 423 String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0); 424 if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null; 425 */ 426 427 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 428 429 // get the remaining combinations 430 return getEquivalents2(remainder); 431 } 432 433 /* 434 // TODO: fix once we have a codepoint interface to get the canonical combining class 435 // TODO: Need public access to canonical combining class in UCharacter! 436 private static int getClass(int cp) { 437 return Normalizer.getClass((char)cp); 438 } 439 */ 440 441 // ================= BUILDER ========================= 442 // TODO: Flatten this data so it doesn't have to be reconstructed each time! 443 444 //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change 445 private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change 446 static { 447 SET_WITH_NULL_STRING.add(""); 448 } 449 450 // private static UnicodeSet SAFE_START = new UnicodeSet(); 451 // private static CharMap AT_START = new CharMap(); 452 453 // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!; 454 // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded. 455 // private static int LAST_UNICODE = 0x10FFFF; 456 /* 457 static { 458 buildData(); 459 } 460 */ 461 /* 462 private static void buildData() { 463 464 if (PROGRESS) System.out.println("Getting Safe Start"); 465 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 466 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 467 int cc = UCharacter.getCombiningClass(cp); 468 if (cc == 0) SAFE_START.add(cp); 469 // will fix to be really safe below 470 } 471 if (PROGRESS) System.out.println(); 472 473 if (PROGRESS) System.out.println("Getting Containment"); 474 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 475 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 476 477 if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue; 478 479 //String istr = UTF16.valueOf(cp); 480 String decomp = Normalizer.normalize(cp, Normalizer.NFD); 481 //if (decomp.equals(istr)) continue; 482 483 // add each character in the decomposition to canBeIn 484 485 int component; 486 for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) { 487 component = UTF16.charAt(decomp, i); 488 if (i == 0) { 489 AT_START.add(component, cp); 490 } else if (UCharacter.getCombiningClass(component) == 0) { 491 SAFE_START.remove(component); 492 } 493 } 494 } 495 if (PROGRESS) System.out.println(); 496 } 497 // the following is just for a map from characters to a set of characters 498 499 private static class CharMap { 500 Map storage = new HashMap(); 501 MutableInt probe = new MutableInt(); 502 boolean converted = false; 503 504 public void add(int cp, int whatItIsIn) { 505 UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp)); 506 if (result == null) { 507 result = new UnicodeSet(); 508 storage.put(probe, result); 509 } 510 result.add(whatItIsIn); 511 } 512 513 public UnicodeSet get(int cp) { 514 return (UnicodeSet) storage.get(probe.set(cp)); 515 } 516 } 517 518 private static class MutableInt { 519 public int contents; 520 public int hashCode() { return contents; } 521 public boolean equals(Object other) { 522 return ((MutableInt)other).contents == contents; 523 } 524 // allows chaining 525 public MutableInt set(int contents) { 526 this.contents = contents; 527 return this; 528 } 529 } 530 */ 531 532 } 533