1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and * 4 * others. All Rights Reserved. * 5 ******************************************************************************* 6 */ 7 package com.ibm.icu.text; 8 9 import java.util.ArrayList; 10 import java.util.HashSet; 11 import java.util.Iterator; 12 import java.util.List; 13 import java.util.Set; 14 15 import com.ibm.icu.impl.Norm2AllModes; 16 import com.ibm.icu.impl.Normalizer2Impl; 17 import com.ibm.icu.impl.Utility; 18 import com.ibm.icu.lang.UCharacter; 19 20 /** 21 * This class allows one to iterate through all the strings that are canonically equivalent to a given 22 * string. For example, here are some sample results: 23 * Results for: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 24 * <pre> 25 1: {A}{RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 26 2: {A}{RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 27 3: {A}{RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 28 4: {A}{RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 29 5: {A WITH RING ABOVE}{d}{DOT ABOVE}{CEDILLA} 30 6: {A WITH RING ABOVE}{d}{CEDILLA}{DOT ABOVE} 31 7: {A WITH RING ABOVE}{d WITH DOT ABOVE}{CEDILLA} 32 8: {A WITH RING ABOVE}{d WITH CEDILLA}{DOT ABOVE} 33 9: {ANGSTROM SIGN}{d}{DOT ABOVE}{CEDILLA} 34 10: {ANGSTROM SIGN}{d}{CEDILLA}{DOT ABOVE} 35 11: {ANGSTROM SIGN}{d WITH DOT ABOVE}{CEDILLA} 36 12: {ANGSTROM SIGN}{d WITH CEDILLA}{DOT ABOVE} 37 *</pre> 38 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones, 39 * since it has not been optimized for that situation. 40 * @author M. Davis 41 * @stable ICU 2.4 42 */ 43 44 public final class CanonicalIterator { 45 /** 46 * Construct a CanonicalIterator object 47 * @param source string to get results for 48 * @stable ICU 2.4 49 */ 50 public CanonicalIterator(String source) { 51 Norm2AllModes allModes = Norm2AllModes.getNFCInstance(); 52 nfd = allModes.decomp; 53 nfcImpl = allModes.impl.ensureCanonIterData(); 54 setSource(source); 55 } 56 57 /** 58 * Gets the NFD form of the current source we are iterating over. 59 * @return gets the source: NOTE: it is the NFD form of the source originally passed in 60 * @stable ICU 2.4 61 */ 62 public String getSource() { 63 return source; 64 } 65 66 /** 67 * Resets the iterator so that one can start again from the beginning. 68 * @stable ICU 2.4 69 */ 70 public void reset() { 71 done = false; 72 for (int i = 0; i < current.length; ++i) { 73 current[i] = 0; 74 } 75 } 76 77 /** 78 * Get the next canonically equivalent string. 79 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 80 * @return the next string that is canonically equivalent. The value null is returned when 81 * the iteration is done. 82 * @stable ICU 2.4 83 */ 84 public String next() { 85 if (done) return null; 86 87 // construct return value 88 89 buffer.setLength(0); // delete old contents 90 for (int i = 0; i < pieces.length; ++i) { 91 buffer.append(pieces[i][current[i]]); 92 } 93 String result = buffer.toString(); 94 95 // find next value for next time 96 97 for (int i = current.length - 1; ; --i) { 98 if (i < 0) { 99 done = true; 100 break; 101 } 102 current[i]++; 103 if (current[i] < pieces[i].length) break; // got sequence 104 current[i] = 0; 105 } 106 return result; 107 } 108 109 /** 110 * Set a new source for this iterator. Allows object reuse. 111 * @param newSource the source string to iterate against. This allows the same iterator to be used 112 * while changing the source string, saving object creation. 113 * @stable ICU 2.4 114 */ 115 public void setSource(String newSource) { 116 source = nfd.normalize(newSource); 117 done = false; 118 119 // catch degenerate case 120 if (newSource.length() == 0) { 121 pieces = new String[1][]; 122 current = new int[1]; 123 pieces[0] = new String[]{""}; 124 return; 125 } 126 127 // find the segments 128 List<String> segmentList = new ArrayList<String>(); 129 int cp; 130 int start = 0; 131 132 // i should be the end of the first code point 133 // break up the string into segements 134 135 int i = UTF16.findOffsetFromCodePoint(source, 1); 136 137 for (; i < source.length(); i += Character.charCount(cp)) { 138 cp = source.codePointAt(i); 139 if (nfcImpl.isCanonSegmentStarter(cp)) { 140 segmentList.add(source.substring(start, i)); // add up to i 141 start = i; 142 } 143 } 144 segmentList.add(source.substring(start, i)); // add last one 145 146 // allocate the arrays, and find the strings that are CE to each segment 147 pieces = new String[segmentList.size()][]; 148 current = new int[segmentList.size()]; 149 for (i = 0; i < pieces.length; ++i) { 150 if (PROGRESS) System.out.println("SEGMENT"); 151 pieces[i] = getEquivalents(segmentList.get(i)); 152 } 153 } 154 155 /** 156 * Simple implementation of permutation. 157 * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b> 158 * @param source the string to find permutations for 159 * @param skipZeros set to true to skip characters with canonical combining class zero 160 * @param output the set to add the results to 161 * @internal 162 * @deprecated This API is ICU internal only. 163 */ 164 @Deprecated 165 public static void permute(String source, boolean skipZeros, Set<String> output) { 166 // TODO: optimize 167 //if (PROGRESS) System.out.println("Permute: " + source); 168 169 // optimization: 170 // if zero or one character, just return a set with it 171 // we check for length < 2 to keep from counting code points all the time 172 if (source.length() <= 2 && UTF16.countCodePoint(source) <= 1) { 173 output.add(source); 174 return; 175 } 176 177 // otherwise iterate through the string, and recursively permute all the other characters 178 Set<String> subpermute = new HashSet<String>(); 179 int cp; 180 for (int i = 0; i < source.length(); i += UTF16.getCharCount(cp)) { 181 cp = UTF16.charAt(source, i); 182 183 // optimization: 184 // if the character is canonical combining class zero, 185 // don't permute it 186 if (skipZeros && i != 0 && UCharacter.getCombiningClass(cp) == 0) { 187 //System.out.println("Skipping " + Utility.hex(UTF16.valueOf(source, i))); 188 continue; 189 } 190 191 // see what the permutations of the characters before and after this one are 192 subpermute.clear(); 193 permute(source.substring(0,i) 194 + source.substring(i + UTF16.getCharCount(cp)), skipZeros, subpermute); 195 196 // prefix this character to all of them 197 String chStr = UTF16.valueOf(source, i); 198 for (String s : subpermute) { 199 String piece = chStr + s; 200 //if (PROGRESS) System.out.println(" Piece: " + piece); 201 output.add(piece); 202 } 203 } 204 } 205 206 // FOR TESTING 207 208 /* 209 *@return the set of "safe starts", characters that are class zero AND are never non-initial in a decomposition. 210 * 211 public static UnicodeSet getSafeStart() { 212 return (UnicodeSet) SAFE_START.clone(); 213 } 214 */ 215 /* 216 *@return the set of characters whose decompositions start with the given character 217 * 218 public static UnicodeSet getStarts(int cp) { 219 UnicodeSet result = AT_START.get(cp); 220 if (result == null) result = EMPTY; 221 return (UnicodeSet) result.clone(); 222 } 223 */ 224 225 // ===================== PRIVATES ============================== 226 227 // debug 228 private static boolean PROGRESS = false; // debug progress 229 //private static Transliterator NAME = PROGRESS ? Transliterator.getInstance("name") : null; 230 private static boolean SKIP_ZEROS = true; 231 232 // fields 233 private final Normalizer2 nfd; 234 private final Normalizer2Impl nfcImpl; 235 private String source; 236 private boolean done; 237 private String[][] pieces; 238 private int[] current; 239 // Note: C will need two more fields, since arrays there don't have lengths 240 // int pieces_length; 241 // int[] pieces_lengths; 242 243 // transient fields 244 private transient StringBuilder buffer = new StringBuilder(); 245 246 247 // we have a segment, in NFD. Find all the strings that are canonically equivalent to it. 248 private String[] getEquivalents(String segment) { 249 Set<String> result = new HashSet<String>(); 250 Set<String> basic = getEquivalents2(segment); 251 Set<String> permutations = new HashSet<String>(); 252 253 // now get all the permutations 254 // add only the ones that are canonically equivalent 255 // TODO: optimize by not permuting any class zero. 256 Iterator<String> it = basic.iterator(); 257 while (it.hasNext()) { 258 String item = it.next(); 259 permutations.clear(); 260 permute(item, SKIP_ZEROS, permutations); 261 Iterator<String> it2 = permutations.iterator(); 262 while (it2.hasNext()) { 263 String possible = it2.next(); 264 265 /* 266 String attempt = Normalizer.normalize(possible, Normalizer.DECOMP, 0); 267 if (attempt.equals(segment)) { 268 */ 269 if (Normalizer.compare(possible, segment,0)==0) { 270 271 if (PROGRESS) System.out.println("Adding Permutation: " + Utility.hex(possible)); 272 result.add(possible); 273 274 } else { 275 if (PROGRESS) System.out.println("-Skipping Permutation: " + Utility.hex(possible)); 276 } 277 } 278 } 279 280 // convert into a String[] to clean up storage 281 String[] finalResult = new String[result.size()]; 282 result.toArray(finalResult); 283 return finalResult; 284 } 285 286 287 private Set<String> getEquivalents2(String segment) { 288 289 Set<String> result = new HashSet<String>(); 290 291 if (PROGRESS) System.out.println("Adding: " + Utility.hex(segment)); 292 293 result.add(segment); 294 StringBuffer workingBuffer = new StringBuffer(); 295 UnicodeSet starts = new UnicodeSet(); 296 297 // cycle through all the characters 298 int cp; 299 for (int i = 0; i < segment.length(); i += Character.charCount(cp)) { 300 301 // see if any character is at the start of some decomposition 302 cp = segment.codePointAt(i); 303 if (!nfcImpl.getCanonStartSet(cp, starts)) { 304 continue; 305 } 306 // if so, see which decompositions match 307 for(UnicodeSetIterator iter = new UnicodeSetIterator(starts); iter.next();) { 308 int cp2 = iter.codepoint; 309 Set<String> remainder = extract(cp2, segment, i, workingBuffer); 310 if (remainder == null) { 311 continue; 312 } 313 314 // there were some matches, so add all the possibilities to the set. 315 String prefix= segment.substring(0,i); 316 prefix += UTF16.valueOf(cp2); 317 for (String item : remainder) { 318 result.add(prefix + item); 319 } 320 } 321 } 322 return result; 323 /* 324 Set result = new HashSet(); 325 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(segment)); 326 result.add(segment); 327 StringBuffer workingBuffer = new StringBuffer(); 328 329 // cycle through all the characters 330 int cp; 331 332 for (int i = 0; i < segment.length(); i += UTF16.getCharCount(cp)) { 333 // see if any character is at the start of some decomposition 334 cp = UTF16.charAt(segment, i); 335 NormalizerImpl.getCanonStartSet(c,fillSet) 336 UnicodeSet starts = AT_START.get(cp); 337 if (starts == null) continue; 338 UnicodeSetIterator usi = new UnicodeSetIterator(starts); 339 // if so, see which decompositions match 340 while (usi.next()) { 341 int cp2 = usi.codepoint; 342 // we know that there are no strings in it 343 // so we don't have to check CharacterIterator.IS_STRING 344 Set remainder = extract(cp2, segment, i, workingBuffer); 345 if (remainder == null) continue; 346 347 // there were some matches, so add all the possibilities to the set. 348 String prefix = segment.substring(0, i) + UTF16.valueOf(cp2); 349 Iterator it = remainder.iterator(); 350 while (it.hasNext()) { 351 String item = (String) it.next(); 352 if (PROGRESS) System.out.println("Adding: " + NAME.transliterate(prefix + item)); 353 result.add(prefix + item); 354 } 355 } 356 } 357 return result; 358 */ 359 } 360 361 /** 362 * See if the decomposition of cp2 is at segment starting at segmentPos 363 * (with canonical rearrangment!) 364 * If so, take the remainder, and return the equivalents 365 */ 366 private Set<String> extract(int comp, String segment, int segmentPos, StringBuffer buf) { 367 if (PROGRESS) System.out.println(" extract: " + Utility.hex(UTF16.valueOf(comp)) 368 + ", " + Utility.hex(segment.substring(segmentPos))); 369 370 String decomp = nfcImpl.getDecomposition(comp); 371 if (decomp == null) { 372 decomp = UTF16.valueOf(comp); 373 } 374 375 // See if it matches the start of segment (at segmentPos) 376 boolean ok = false; 377 int cp; 378 int decompPos = 0; 379 int decompCp = UTF16.charAt(decomp,0); 380 decompPos += UTF16.getCharCount(decompCp); // adjust position to skip first char 381 //int decompClass = getClass(decompCp); 382 buf.setLength(0); // initialize working buffer, shared among callees 383 384 for (int i = segmentPos; i < segment.length(); i += UTF16.getCharCount(cp)) { 385 cp = UTF16.charAt(segment, i); 386 if (cp == decompCp) { // if equal, eat another cp from decomp 387 if (PROGRESS) System.out.println(" matches: " + Utility.hex(UTF16.valueOf(cp))); 388 if (decompPos == decomp.length()) { // done, have all decomp characters! 389 buf.append(segment.substring(i + UTF16.getCharCount(cp))); // add remaining segment chars 390 ok = true; 391 break; 392 } 393 decompCp = UTF16.charAt(decomp, decompPos); 394 decompPos += UTF16.getCharCount(decompCp); 395 //decompClass = getClass(decompCp); 396 } else { 397 if (PROGRESS) System.out.println(" buffer: " + Utility.hex(UTF16.valueOf(cp))); 398 // brute force approach 399 UTF16.append(buf, cp); 400 /* TODO: optimize 401 // since we know that the classes are monotonically increasing, after zero 402 // e.g. 0 5 7 9 0 3 403 // we can do an optimization 404 // there are only a few cases that work: zero, less, same, greater 405 // if both classes are the same, we fail 406 // if the decomp class < the segment class, we fail 407 408 segClass = getClass(cp); 409 if (decompClass <= segClass) return null; 410 */ 411 } 412 } 413 if (!ok) return null; // we failed, characters left over 414 if (PROGRESS) System.out.println("Matches"); 415 if (buf.length() == 0) return SET_WITH_NULL_STRING; // succeed, but no remainder 416 String remainder = buf.toString(); 417 418 // brute force approach 419 // to check to make sure result is canonically equivalent 420 /* 421 String trial = Normalizer.normalize(UTF16.valueOf(comp) + remainder, Normalizer.DECOMP, 0); 422 if (!segment.regionMatches(segmentPos, trial, 0, segment.length() - segmentPos)) return null; 423 */ 424 425 if (0!=Normalizer.compare(UTF16.valueOf(comp) + remainder, segment.substring(segmentPos), 0)) return null; 426 427 // get the remaining combinations 428 return getEquivalents2(remainder); 429 } 430 431 /* 432 // TODO: fix once we have a codepoint interface to get the canonical combining class 433 // TODO: Need public access to canonical combining class in UCharacter! 434 private static int getClass(int cp) { 435 return Normalizer.getClass((char)cp); 436 } 437 */ 438 439 // ================= BUILDER ========================= 440 // TODO: Flatten this data so it doesn't have to be reconstructed each time! 441 442 //private static final UnicodeSet EMPTY = new UnicodeSet(); // constant, don't change 443 private static final Set<String> SET_WITH_NULL_STRING = new HashSet<String>(); // constant, don't change 444 static { 445 SET_WITH_NULL_STRING.add(""); 446 } 447 448 // private static UnicodeSet SAFE_START = new UnicodeSet(); 449 // private static CharMap AT_START = new CharMap(); 450 451 // TODO: WARNING, NORMALIZER doesn't have supplementaries yet !!; 452 // Change FFFF to 10FFFF in C, and in Java when normalizer is upgraded. 453 // private static int LAST_UNICODE = 0x10FFFF; 454 /* 455 static { 456 buildData(); 457 } 458 */ 459 /* 460 private static void buildData() { 461 462 if (PROGRESS) System.out.println("Getting Safe Start"); 463 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 464 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 465 int cc = UCharacter.getCombiningClass(cp); 466 if (cc == 0) SAFE_START.add(cp); 467 // will fix to be really safe below 468 } 469 if (PROGRESS) System.out.println(); 470 471 if (PROGRESS) System.out.println("Getting Containment"); 472 for (int cp = 0; cp <= LAST_UNICODE; ++cp) { 473 if (PROGRESS & (cp & 0x7FF) == 0) System.out.print('.'); 474 475 if (Normalizer.isNormalized(cp, Normalizer.NFD)) continue; 476 477 //String istr = UTF16.valueOf(cp); 478 String decomp = Normalizer.normalize(cp, Normalizer.NFD); 479 //if (decomp.equals(istr)) continue; 480 481 // add each character in the decomposition to canBeIn 482 483 int component; 484 for (int i = 0; i < decomp.length(); i += UTF16.getCharCount(component)) { 485 component = UTF16.charAt(decomp, i); 486 if (i == 0) { 487 AT_START.add(component, cp); 488 } else if (UCharacter.getCombiningClass(component) == 0) { 489 SAFE_START.remove(component); 490 } 491 } 492 } 493 if (PROGRESS) System.out.println(); 494 } 495 // the following is just for a map from characters to a set of characters 496 497 private static class CharMap { 498 Map storage = new HashMap(); 499 MutableInt probe = new MutableInt(); 500 boolean converted = false; 501 502 public void add(int cp, int whatItIsIn) { 503 UnicodeSet result = (UnicodeSet) storage.get(probe.set(cp)); 504 if (result == null) { 505 result = new UnicodeSet(); 506 storage.put(probe, result); 507 } 508 result.add(whatItIsIn); 509 } 510 511 public UnicodeSet get(int cp) { 512 return (UnicodeSet) storage.get(probe.set(cp)); 513 } 514 } 515 516 private static class MutableInt { 517 public int contents; 518 public int hashCode() { return contents; } 519 public boolean equals(Object other) { 520 return ((MutableInt)other).contents == contents; 521 } 522 // allows chaining 523 public MutableInt set(int contents) { 524 this.contents = contents; 525 return this; 526 } 527 } 528 */ 529 530 } 531